diff --git a/mindspore/ccsrc/backend/common/session/kernel_graph.h b/mindspore/ccsrc/backend/common/session/kernel_graph.h index 0d4c508f81e..abff236c169 100644 --- a/mindspore/ccsrc/backend/common/session/kernel_graph.h +++ b/mindspore/ccsrc/backend/common/session/kernel_graph.h @@ -50,6 +50,13 @@ struct KernelWithIndexCmp { } }; +struct SomasInfo { + // whole_block_size_ is 0 indicating that somas did not allocate memory for this graph. + size_t whole_block_size_{0}; + // offset -> aligned_size_ + std::map merged_blocks_map_; +}; + using DeviceType = device::DeviceType; using KernelMapTensor = std::map; @@ -57,6 +64,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph { public: KernelGraph() : inputs_(std::make_shared>()), + somas_info_(std::make_shared()), graph_id_(0), stream_distinction_label_(kInvalidDistincLabel), device_target_(DeviceType::kUnknown), @@ -69,6 +77,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph { KernelGraph(const KernelGraph &graph) : FuncGraph(graph) { inputs_ = graph.inputs_; + somas_info_ = graph.somas_info_; child_graph_result_ = graph.child_graph_result_; execution_order_ = graph.execution_order_; mem_reuse_exec_order_ = graph.mem_reuse_exec_order_; @@ -452,6 +461,11 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph { bool IsCommSubGraph(uint32_t id) const { return comm_sub_graph_ids_.find(id) != comm_sub_graph_ids_.end(); } void RecordNewCommSubGraphId(uint32_t id) { comm_sub_graph_ids_.insert(id); } + // somas total memory size + SomasInfo *MutableSomasInfo() const { return somas_info_.get(); } + size_t somas_whole_block_size() const { return somas_info_->whole_block_size_; } + const std::map &somas_merged_blocks_map() const { return somas_info_->merged_blocks_map_; } + private: // remove value node form graph bool RemoveValueNodeFromGraph(const ValueNodePtr &value_node); @@ -477,6 +491,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph { // members std::shared_ptr> inputs_; + std::shared_ptr somas_info_; 
std::vector child_graph_result_; std::vector execution_order_; std::vector mem_reuse_exec_order_; diff --git a/mindspore/ccsrc/backend/common/somas/somas.cc b/mindspore/ccsrc/backend/common/somas/somas.cc index 7cd5055c821..1bb8c78cc67 100644 --- a/mindspore/ccsrc/backend/common/somas/somas.cc +++ b/mindspore/ccsrc/backend/common/somas/somas.cc @@ -20,19 +20,16 @@ #include #include #include -#include #include +#include #include "backend/common/somas/somas_node.h" #include "backend/common/somas/somas_solver_pre.h" #include "backend/common/somas/somas_stream.h" #include "backend/common/somas/somas_tensor.h" -#ifdef ENABLE_D -#include "plugin/device/ascend/hal/device/ascend_stream_assign.h" -#endif #include "backend/common/optimizer/helper.h" -#include "utils/ms_context.h" #include "include/common/debug/common.h" +#include "include/common/debug/anf_ir_dump.h" #ifdef ENABLE_DUMP_IR #include "debug/rdr/string_recorder.h" #endif @@ -46,15 +43,16 @@ using mindspore::profiler::ascend::TensorMemory; #endif namespace mindspore { namespace somas { -constexpr auto kGapSize = 512; constexpr auto kRetryIntervalSeconds = 500; -constexpr size_t kRefNodeTensorNum = 2; +constexpr auto kRefNodeTensorNum = 2; constexpr auto kOnlyOneDestinationNode = 1; constexpr auto kOnlyTwoDestinationNode = 2; +constexpr auto kNopNodeRealInputIndex = 1; +constexpr auto kZeroAlignSize = 1; constexpr auto kGraphId = "graph_id"; constexpr auto kHashId = "hash_id"; -constexpr auto kMemOffset = "mem_offset"; +constexpr auto kReused_memory_size = "reused_memory_size"; constexpr auto kNodeSize = "node_size"; constexpr auto kTensorSize = "tensor_size"; constexpr auto kContiguousSize = "contiguous_size"; @@ -72,104 +70,203 @@ constexpr auto kLifeEnd = "life_end"; constexpr auto kOffset = "offset"; constexpr auto kCachedResultThreshold = 2000; -std::map tensor_type_name_map = {{kCommon, "Common"}, - {kOutputOnly, "OutputOnly"}, - {kWorkspace, "Workspace"}, - {kGetNextOutput, "GetNextOutput"}, - 
{kSummaryInput, "SummaryInput"}, - {kRefNodeInput, "RefNodeInput"}, - {kRefNodeOutput, "RefNodeOutput"}, - {kEventVirtualOutput, "EventVirtualOutput"}, - {kUnknown, "Unknown"}}; - -std::map life_long_name_map = {{kLifeLongNone, "LifeLongNone"}, - {kLifeLongGraphAll, "LifeLongGraphAll"}, - {kLifeLongGraphStart, "LifeLongGraphStart"}, - {kLifeLongGraphEnd, "LifeLongGraphEnd"}}; - -bool Somas::Allocate(const session::KernelGraph *graph) { - MS_LOG(DEBUG) << "Somas Allocate start..."; - auto ret = InitSomasTensors(graph); - if (!ret) { - MS_LOG(EXCEPTION) << "Somas Initialize Failed."; +// set somas result +void SetSomasResult(std::vector> &&output_somas_result, + std::vector> &&workspace_somas_result, AnfNode *node) { + MS_EXCEPTION_IF_NULL(node); + auto kernel_info = dynamic_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + if (!kernel_info->SetSomasResult(std::move(output_somas_result), std::move(workspace_somas_result))) { + MS_LOG(EXCEPTION) << "Node " << node->DebugString() << "set somas result fail. 
"; } +} + +void MergeBlocks(std::vector *block_list, std::stack *merged_blocks) { + if (block_list->empty()) { + MS_LOG(INFO) << "No block to merge."; + return; + } + std::sort(block_list->begin(), block_list->end(), [](const Block &block1, const Block &block2) { + return (block1.start_offset_ < block2.start_offset_) || + ((block1.start_offset_ == block2.start_offset_) && (block1.end_offset_ < block2.end_offset_)); + }); + merged_blocks->push(Block((*block_list)[0].start_offset_, (*block_list)[0].size_)); + for (size_t i = 1; i < block_list->size(); i++) { + Block &top = merged_blocks->top(); + auto &block = (*block_list)[i]; + if (block.start_offset_ >= top.end_offset_) { + merged_blocks->push(Block(block.start_offset_, block.size_)); + } else if (block.end_offset_ > top.end_offset_) { + top.end_offset_ = block.end_offset_; + top.size_ = top.end_offset_ - top.start_offset_; + } + } +} + +bool Somas::Assign(const session::KernelGraph &graph) { + MS_LOG(INFO) << "Start Somas Assign for graph " << graph.graph_id(); + if (graph.is_dynamic_shape()) { + MS_LOG(WARNING) << "Somas can't allocate graph with dynamic_shape now."; + return false; + } + auto ret = ConfigSomas(graph); + if (!ret) { + MS_LOG(EXCEPTION) << "Config Somas Failed."; + } + MS_LOG(INFO) << "Somas Configure success, configuration info: " + << "\nDevice Name: " << device_name_ << "\nRun by execution order: " << depend_exec_order_ + << "\nEnable debug log: " << save_debug_info_ << "\nDebug log path: " << debug_info_path_; + MS_LOG(INFO) << "Start Initialize SOMAS Model"; + + ret = InitSomasModel(graph); + if (!ret) { + MS_LOG(EXCEPTION) << "Somas modeling Failed for graph " << graph.graph_id(); + } + MS_LOG(INFO) << "End Initialize SOMAS Model"; if (tensors_list_.empty()) { - MS_LOG(INFO) << "No Tensor for Somas"; + MS_LOG(INFO) << "No Somas Tensor in graph " << graph.graph_id(); return true; } - ret = LoadSomasCache(graph); - if (ret) { - GenGraphStatisticInfo(); - return ret; + if (enable_cache_) { + 
ret = LoadSomasCache(graph); + if (ret) { + GenGraphStatisticInfo(); + UpdateSomasResultToGraph(graph); + DumpSomasModelInfo("somas_tensor_offset", graph.graph_id()); + MS_LOG(INFO) << "Somas Allocate end."; + return ret; + } } // Computing Conflict pairs - MS_LOG(INFO) << "Start Computing Conflict Pairs"; - ComputeConflictPairs(); - MS_LOG(INFO) << "End Computing Conflict Pairs"; + MS_LOG(INFO) << "Start Computing Conflict Matrix"; + ComputeConflictMatrix(); + MS_LOG(INFO) << "End Computing Conflict Matrix"; - ret = Assign(graph); + ret = Solve(graph); if (!ret) { MS_LOG(EXCEPTION) << "Somas Assign Failed."; } - SaveSomasResult(graph); + GenGraphStatisticInfo(); - MS_LOG(DEBUG) << "Somas Allocate end."; + if (enable_cache_) { + SaveSomasResult(graph); + } + + UpdateSomasResultToGraph(graph); + DumpSomasModelInfo("somas_tensor_offset", graph.graph_id()); + + MS_LOG(INFO) << "Somas Allocate end."; return ret; } -bool Somas::LoadSomasCache(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - MS_LOG(DEBUG) << "Somas LoadSomasCache start..."; - if (tensors_list_.size() < kCachedResultThreshold) { - MS_LOG(DEBUG) << "Tensors size (" << tensors_list_.size() << ") less than " << kCachedResultThreshold - << ", no need to load cached"; +bool Somas::Assign(const KernelGraphPtr &graph_ptr) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + MS_EXCEPTION_IF_NULL(graph_ptr); +#ifndef ENABLE_SECURITY + auto enable_save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); + if (enable_save_graphs) { + std::string file_name = "somas_input_graph_" + std::to_string(graph_ptr->graph_id()) + ".ir"; + DumpIR(file_name, graph_ptr, true, kWholeStack); + } +#endif + return Assign(*graph_ptr); +} + +size_t Somas::GetCommunicationReservedSize() const { return 0; } + +bool Somas::GetEnableCacheFlag(const session::KernelGraph &graph) const { + return graph.execution_order().size() >= kCachedResultThreshold; +} + +std::pair 
Somas::GetDebugConfig() const { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + auto enable_save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); + auto save_graphs_path = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + return std::make_pair(enable_save_graphs, save_graphs_path); +} + +std::vector> Somas::GetStreamGroupInfo(const session::KernelGraph &graph) const { + std::vector> stream_group; + return stream_group; +} + +std::map Somas::GetUnReuseNodeType(const session::KernelGraph &graph) const { + std::map node_type; + return node_type; +} + +std::map Somas::GetUnReuseNodeName(const session::KernelGraph &graph) const { + std::map name_type; + return name_type; +} + +bool Somas::ConfigSomas(const session::KernelGraph &graph) { + auto ret = Initialize(); + if (!ret) { + MS_LOG(ERROR) << "Somas Initialize failed. Please Check!!!"; return false; } + device_name_ = GetDeviceName(); + communication_gap_size_ = GetCommunicationReservedSize(); + enable_cache_ = GetEnableCacheFlag(graph); + depend_exec_order_ = GetDependExecOrderFlag(graph); + auto debug_config = GetDebugConfig(); + save_debug_info_ = debug_config.first; + debug_info_path_ = debug_config.second; + streams_groups_ = GetStreamGroupInfo(graph); + un_reuse_node_type_.clear(); + auto device_un_reuse_type = GetUnReuseNodeType(graph); + un_reuse_node_type_.insert(device_un_reuse_type.begin(), device_un_reuse_type.end()); + un_reuse_node_name_.clear(); + auto device_un_reuse_name = GetUnReuseNodeName(graph); + un_reuse_node_name_.insert(device_un_reuse_name.begin(), device_un_reuse_name.end()); + return true; +} +bool Somas::LoadSomasCache(const session::KernelGraph &graph) { + MS_LOG(DEBUG) << "Somas LoadSomasCache start..."; bool ret = CalcSomasModelHash(graph); if (ret) { std::string filename = Common::GetCompilerCachePath() + "/somas_meta/somas_graph_" + - std::to_string(graph->graph_id()) + "_" 
+ hash_id_ + ".json"; + std::to_string(graph.graph_id()) + "_" + hash_id_ + ".json"; ret = LoadSomasResult(graph, filename); if (ret) { MS_LOG(INFO) << "Load Somas Cache file " << filename << " Successfully."; } } else { - MS_LOG(ERROR) << "Calculate somas's model hash id failed."; + MS_LOG(ERROR) << "Calculate SOMAS model hash id failed."; } MS_LOG(DEBUG) << "Somas LoadSomasCache end."; return ret; } -bool Somas::CalcSomasModelHash(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); +bool Somas::CalcSomasModelHash(const session::KernelGraph &graph) { auto model_str = SomasInfo(true); hash_id_ = std::to_string(std::hash()(model_str)); - MS_LOG(INFO) << "Graph " << graph->graph_id() << "'s SOMAS Model hash id is " << hash_id_; + MS_LOG(INFO) << "Graph " << graph.graph_id() << "'s SOMAS Model hash id is " << hash_id_; std::string filename = Common::GetCompilerCachePath() + "/somas_meta/somas_graph_" + - std::to_string(graph->graph_id()) + "_" + hash_id_ + ".info"; + std::to_string(graph.graph_id()) + "_" + hash_id_ + ".info"; return Common::SaveStringToFile(filename, model_str); } -bool Somas::SaveSomasResult(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - if (tensors_list_.size() < kCachedResultThreshold) { - MS_LOG(DEBUG) << "Tensors size (" << tensors_list_.size() << ") less than " << kCachedResultThreshold - << ", no need to save result"; - return false; - } +bool Somas::SaveSomasResult(const session::KernelGraph &graph) { nlohmann::json somas_json; - somas_json[kGraphId] = graph->graph_id(); + somas_json[kGraphId] = graph.graph_id(); somas_json[kHashId] = hash_id_; - somas_json[kMemOffset] = mem_offset_; + somas_json[kReused_memory_size] = reused_memory_size_; somas_json[kNodeSize] = nodes_list_.size(); somas_json[kTensorSize] = tensors_list_.size(); somas_json[kContiguousSize] = contiguous_tensors_list_.size(); - somas_json[kRefNodeSize] = ref_node_constraints_.size(); - somas_json[kStreamSize] = streams_list_.size(); + 
somas_json[kRefNodeSize] = union_tensors_list_.size(); + somas_json[kStreamSize] = streams_map_.size(); somas_json[kStreamGroupSize] = streams_groups_.size(); std::vector tensors_json; for (auto &tensor : tensors_list_) { @@ -187,12 +284,48 @@ bool Somas::SaveSomasResult(const session::KernelGraph *graph) { somas_json[kTensors] = tensors_json; std::string filename = Common::GetCompilerCachePath() + "/somas_meta/somas_graph_" + - std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json"; + std::to_string(graph.graph_id()) + "_" + hash_id_ + ".json"; (void)Common::SaveStringToFile(filename, somas_json.dump()); return true; } -bool Somas::LoadSomasResult(const session::KernelGraph *graph, const string &filename) { +bool Somas::UpdateSomasResultToGraph(const session::KernelGraph &graph) { + auto &execution_nodes = graph.execution_order(); + std::vector block_list; + for (auto &node : execution_nodes) { + auto kernel_mod = AnfAlgo::GetKernelMod(node); + MS_EXCEPTION_IF_NULL(kernel_mod); + auto output_somas_result = GetNodeOutputSomasResult(node); + auto workspace_somas_result = GetNodeWorkSpaceSomasResult(node); + + for (const auto &somas_offset_aligned_size : output_somas_result) { + if (somas_offset_aligned_size.second > 0) { + block_list.emplace_back(somas_offset_aligned_size.first, somas_offset_aligned_size.second); + } + } + for (const auto &somas_offset_aligned_size : workspace_somas_result) { + if (somas_offset_aligned_size.second > 0) { + block_list.emplace_back(somas_offset_aligned_size.first, somas_offset_aligned_size.second); + } + } + + SetSomasResult(std::move(output_somas_result), std::move(workspace_somas_result), node.get()); + } + + std::stack merged_blocks; + MergeBlocks(&block_list, &merged_blocks); + session::SomasInfo *somas_info = graph.MutableSomasInfo(); + somas_info->whole_block_size_ = reused_memory_size_; + while (!merged_blocks.empty()) { + auto block = merged_blocks.top(); + merged_blocks.pop(); + 
somas_info->merged_blocks_map_[block.start_offset_] = block.size_; + dump_merged_blocks_.emplace_back(block.start_offset_, block.size_); + } + return true; +} + +bool Somas::LoadSomasResult(const session::KernelGraph &graph, const string &filename) { std::ifstream somas_json_fs(filename); if (!somas_json_fs.is_open()) { MS_LOG(INFO) << "Open json file: " << filename << " error, Somas Cache Missed."; @@ -220,27 +353,19 @@ bool Somas::LoadSomasResult(const session::KernelGraph *graph, const string &fil MS_LOG(WARNING) << "Verify Somas Result Failed."; return false; } - auto mem_offset = somas_json[kMemOffset]; - mem_offset_ = mem_offset; + reused_memory_size_ = somas_json[kReused_memory_size]; ret = UpdateTensorsOffset(somas_json[kTensors]); return ret; } -bool Somas::VerifySomasResult(const session::KernelGraph *graph, const nlohmann::json &somas_json) const { - MS_EXCEPTION_IF_NULL(graph); - auto graph_id = somas_json[kGraphId]; - auto hash_id = somas_json[kHashId]; - auto node_size = somas_json[kNodeSize]; - auto tensor_size = somas_json[kTensorSize]; - auto contiguous_size = somas_json[kContiguousSize]; - auto ref_node_size = somas_json[kRefNodeSize]; - auto stream_size = somas_json[kStreamSize]; - auto stream_group_size = somas_json[kStreamGroupSize]; - - if (graph_id != graph->graph_id()) { - MS_LOG(WARNING) << "Mismatch graph id " << graph_id << " vs " << graph->graph_id(); - return false; - } +bool Somas::VerifySomasResult(const session::KernelGraph &graph, const nlohmann::json &somas_json) const { + const auto &hash_id = somas_json[kHashId]; + const auto &node_size = somas_json[kNodeSize]; + const auto &tensor_size = somas_json[kTensorSize]; + const auto &contiguous_size = somas_json[kContiguousSize]; + const auto &ref_node_size = somas_json[kRefNodeSize]; + const auto &stream_size = somas_json[kStreamSize]; + const auto &stream_group_size = somas_json[kStreamGroupSize]; if (hash_id != hash_id_) { MS_LOG(WARNING) << "Mismatch hash id " << hash_id << " vs " 
<< hash_id_; @@ -262,13 +387,13 @@ bool Somas::VerifySomasResult(const session::KernelGraph *graph, const nlohmann: return false; } - if (ref_node_size != ref_node_constraints_.size()) { - MS_LOG(WARNING) << "Mismatch ref node size " << ref_node_size << " vs " << ref_node_constraints_.size(); + if (ref_node_size != union_tensors_list_.size()) { + MS_LOG(WARNING) << "Mismatch ref node size " << ref_node_size << " vs " << union_tensors_list_.size(); return false; } - if (stream_size != streams_list_.size()) { - MS_LOG(WARNING) << "Mismatch stream size " << stream_size << " vs " << streams_list_.size(); + if (stream_size != streams_map_.size()) { + MS_LOG(WARNING) << "Mismatch stream size " << stream_size << " vs " << streams_map_.size(); return false; } @@ -277,136 +402,221 @@ bool Somas::VerifySomasResult(const session::KernelGraph *graph, const nlohmann: return false; } + const auto &tensors_json = somas_json[kTensors]; + for (const auto &tensor_json : tensors_json) { + const auto &tensor_id = tensor_json[kTensorId]; + const auto &size = tensor_json[kSize]; + const auto &ori_size = tensor_json[kOriSize]; + const auto &lifelong_value = tensor_json[kLifelongValue]; + const auto &life_start = tensor_json[kLifeStart]; + const auto &life_end = tensor_json[kLifeEnd]; + if (tensor_id < tensors_list_.size()) { + auto &tensor = tensors_list_[tensor_id]; + MS_EXCEPTION_IF_NULL(tensor); + if (size != tensor->aligned_size_) { + MS_LOG(WARNING) << "Mismatch size of tensor " << tensor_id << " " << size << " vs " << tensor->aligned_size_; + return false; + } + + if (ori_size != tensor->GetOriginalSize()) { + MS_LOG(WARNING) << "Mismatch original size of tensor " << tensor_id << " " << ori_size << " vs " + << tensor->GetOriginalSize(); + return false; + } + + if (lifelong_value != tensor->lifelong_value_) { + MS_LOG(WARNING) << "Mismatch lifelong value of tensor " << tensor_id << " " << lifelong_value << " vs " + << tensor->lifelong_value_; + return false; + } + + if (life_start 
!= tensor->lifetime_.start_) { + MS_LOG(WARNING) << "Mismatch life start of tensor " << tensor_id << " " << life_start << " vs " + << tensor->lifetime_.start_; + return false; + } + + if (life_end != tensor->lifetime_.end_) { + MS_LOG(WARNING) << "Mismatch life start of tensor " << tensor_id << " " << life_end << " vs " + << tensor->lifetime_.end_; + return false; + } + } else { + MS_LOG(WARNING) << "Can't find tensor " << tensor_id; + return false; + } + } + return true; } bool Somas::UpdateTensorsOffset(const std::vector &tensors_json) { bool ret = true; for (auto &tensor_json : tensors_json) { - auto tensor_id = tensor_json[kTensorId]; - auto size = tensor_json[kSize]; - auto ori_size = tensor_json[kOriSize]; - auto lifelong_value = tensor_json[kLifelongValue]; - auto life_start = tensor_json[kLifeStart]; - auto life_end = tensor_json[kLifeEnd]; - auto offset = tensor_json[kOffset]; - auto iter = tensors_map_.find(tensor_id); - if (iter != tensors_map_.end()) { - MS_EXCEPTION_IF_NULL(iter->second); - if (size != iter->second->aligned_size_) { - MS_LOG(WARNING) << "Mismatch size of tensor " << tensor_id << " " << size << " vs " - << iter->second->aligned_size_; - ret = false; - break; - } - - if (ori_size != iter->second->GetOriginalSize()) { - MS_LOG(WARNING) << "Mismatch original size of tensor " << tensor_id << " " << ori_size << " vs " - << iter->second->GetOriginalSize(); - ret = false; - break; - } - - if (lifelong_value != iter->second->lifelong_value_) { - MS_LOG(WARNING) << "Mismatch lifelong value of tensor " << tensor_id << " " << lifelong_value << " vs " - << iter->second->lifelong_value_; - ret = false; - break; - } - - if (life_start != iter->second->lifetime_.start_) { - MS_LOG(WARNING) << "Mismatch life start of tensor " << tensor_id << " " << life_start << " vs " - << iter->second->lifetime_.start_; - ret = false; - break; - } - - if (life_end != iter->second->lifetime_.end_) { - MS_LOG(WARNING) << "Mismatch life start of tensor " << tensor_id << 
" " << life_end << " vs " - << iter->second->lifetime_.end_; - ret = false; - break; - } - - // verify pass, update memory offset - iter->second->offset_ = offset; - } else { - MS_LOG(WARNING) << "Can't find tensor " << tensor_id; - ret = false; - break; - } + const auto &tensor_id = tensor_json[kTensorId]; + const auto &size = tensor_json[kSize]; + const auto &offset = tensor_json[kOffset]; + auto &tensor = tensors_list_[tensor_id]; + MS_EXCEPTION_IF_NULL(tensor); + // update memory offset + tensor->offset_ = offset; + tensor->aligned_size_ = size; } return ret; } -bool Somas::InitSomasTensors(const session::KernelGraph *graph) { - MS_LOG(DEBUG) << "Somas InitSomasTensors start..."; - MS_EXCEPTION_IF_NULL(graph); - InitBasicInfo(graph); - IndependentNodeOutputProcess(graph); +bool Somas::InitSomasModel(const session::KernelGraph &graph) { + MS_EXCEPTION_IF_CHECK_FAIL(InitBasicInfoFromGraph(graph), "Init SOMAS basic info from graph failed."); +#if defined(ENABLE_DUMP_IR) && !defined(ENABLE_SECURITY) + SubModuleId module = SubModuleId::SM_OPTIMIZER; + std::string name = device_name_ + "_somas_initial_info." 
+ std::to_string(graph.graph_id()); + (void)mindspore::RDR::RecordString(module, name, SomasInfo()); +#endif + DumpSomasModelInfo("somas_initial_info", graph.graph_id()); + + MS_EXCEPTION_IF_CHECK_FAIL(InitDevSpecControlTensors(graph), "Init device special control tensors failed."); + DumpSomasModelInfo("somas_device_control_info", graph.graph_id()); + + MS_EXCEPTION_IF_CHECK_FAIL(CommonSpecNodeProcess(graph), "Common special node process failed."); + DumpSomasModelInfo("somas_common_spec_node_process", graph.graph_id()); + + MS_EXCEPTION_IF_CHECK_FAIL(DevSpecNodeProcess(graph), "Device specify special node process failed."); + DumpSomasModelInfo("somas_device_spec_node_process", graph.graph_id()); + + UnReuseNodeProcess(graph); + UpdateContiguousTensorList(); + if (tensors_list_.empty()) { + MS_LOG(INFO) << "No Tensor from graph " << graph.graph_id(); + return true; + } + + MS_LOG(INFO) << "Created " << streams_map_.size() << " streams (" << streams_groups_.size() << " groups), " + << nodes_list_.size() << " nodes, " << tensors_list_.size() << " tensors, " << union_tensors_list_.size() + << " union tensors lists, and " << contiguous_tensors_list_.size() << " contiguous tensors lists"; + +#if defined(ENABLE_DUMP_IR) && !defined(ENABLE_SECURITY) + name = device_name_ + "_somas_pre_processed_info." + std::to_string(graph.graph_id()); + (void)mindspore::RDR::RecordString(module, name, SomasInfo()); + name = device_name_ + "_somas_offline_log." 
+ std::to_string(graph.graph_id()); + (void)mindspore::RDR::RecordString(module, name, Offline()); +#endif + + DumpSomasModelInfo("somas_pre_processed_info", graph.graph_id()); + if (save_debug_info_) { + std::string offline_file_path = GetSaveGraphsPathName( + "/" + device_name_ + "_somas_offline_log_" + std::to_string(graph.graph_id()) + ".ir", debug_info_path_); + DumpOfflineIR(offline_file_path); + } + return true; +} + +void Somas::AddControlTensor(const SomasNodePtr &from, const SomasNodePtr &to) { + size_t control_tensor_index = control_tensors_list_.size(); + SomasTensorPtr tensor = + std::make_shared(control_tensor_index, from->GetId(), from->GetStreamId(), 0, 0, kLifeLongNone); + tensor->lifetime_.start_ = from->GetId(); + tensor->lifetime_.end_ = to->GetId(); + tensor->type_ = kControl; + tensor->destination_nodes_.insert(to->GetId()); + tensor->consumer_list_.emplace_back(to->GetId()); + from->control_output_tensors_.push_back(tensor); + to->control_input_tensors_.push_back(tensor); + to->ancestor_nodes_.insert(from); + control_tensors_list_.push_back(tensor); +} + +void Somas::AddControlTensorFromExecOrder(const session::KernelGraph &graph) { + // Loop to add control edges within each stream (node order within stream) + for (const auto &stream_kv : streams_map_) { + auto stream = stream_kv.second; + MS_EXCEPTION_IF_NULL(stream); + auto &nodes = stream->nodes_; + std::sort(nodes.begin(), nodes.end(), NodeSort); + for (size_t i = 1; i < nodes.size(); i++) { + const auto &previous_node = nodes[i - 1]; + const auto ¤t_node = nodes[i]; + MS_EXCEPTION_IF_NULL(current_node); + AddControlTensor(previous_node, current_node); + } + } + + // Loop to add control edges from end to beginning of next group + for (const auto &group : streams_groups_) { + for (size_t i = 1; i < group.size(); i++) { + size_t previous_stream = group[i - 1]; + size_t current_stream = group[i]; + + auto stream = GetSomasStream(previous_stream); + if (stream == nullptr) { + continue; + } + 
+ auto &last_node_in_prev_stream = stream->nodes_.back(); + + stream = GetSomasStream(current_stream); + if (stream == nullptr) { + continue; + } + auto &first_node_in_cur_stream = stream->nodes_.front(); + AddControlTensor(last_node_in_prev_stream, first_node_in_cur_stream); + } + } + + // Loop to compute max destinations in each stream + mindspore::HashMap stream_max_destination_node; + // Loop to compute max destinations in each stream + for (const auto &tensor : tensors_list_) { + MS_EXCEPTION_IF_NULL(tensor); + stream_max_destination_node.clear(); + for (const auto &node_id : tensor->destination_nodes_) { + auto node = GetSomasNode(node_id); + MS_EXCEPTION_IF_NULL(node); + if (node_id > stream_max_destination_node[node->GetStreamId()]) { + stream_max_destination_node[node->GetStreamId()] = node_id; + } + } + + tensor->consumer_list_.clear(); + for (const auto &dst_map : stream_max_destination_node) { + tensor->consumer_list_.emplace_back(dst_map.second); + } + } +} + +void Somas::InitControlTensors(const session::KernelGraph &graph) { + if (depend_exec_order_) { + AddControlTensorFromExecOrder(graph); + } +} + +bool Somas::CommonSpecNodeProcess(const session::KernelGraph &graph) { #ifndef ENABLE_SECURITY SummaryInputProcess(graph); #endif RefNodeProcess(graph); - NonTaskSplitProcess(graph); - UnReuseNodeProcess(graph); - GenContiguousList(graph); - GetNextOutputProcess(graph); - - if (tensors_list_.empty()) { - MS_LOG(INFO) << "No Tensor from graph " << graph->graph_id(); - return true; - } - - MS_LOG(INFO) << "Created " << streams_list_.size() << " streams (" << streams_groups_.size() << " groups), " - << nodes_list_.size() << " nodes, " << tensors_list_.size() << " tensors, and " - << contiguous_tensors_list_.size() << " contiguous lists"; - -#ifdef ENABLE_DUMP_IR - SubModuleId module = SubModuleId::SM_OPTIMIZER; - std::string name = "somas_pre_processed_info." 
+ std::to_string(graph->graph_id()); - (void)mindspore::RDR::RecordString(module, name, SomasInfo()); - name = "somas_offline_log." + std::to_string(graph->graph_id()); - (void)mindspore::RDR::RecordString(module, name, Offline()); -#endif - - if (save_graphs_) { - std::string file_path = GetSaveGraphsPathName( - "/somas_pre_processed_info_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_); - DumpSomasInfoIR(file_path); - - std::string offline_file_path = - GetSaveGraphsPathName("/somas_offline_log_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_); - DumpOfflineIR(offline_file_path); - } - MS_LOG(DEBUG) << "Somas InitSomasTensors end."; + CommunicationNodeProcess(graph); return true; } -void Somas::InitSomasStreamAndNode(const session::KernelGraph *graph) { +void Somas::InitSomasStreamAndNode(const session::KernelGraph &graph) { MS_LOG(DEBUG) << "Somas InitSomasStreamAndNode start..."; - MS_EXCEPTION_IF_NULL(graph); - std::vector kernel_cnodes; - streams_list_ = {}; + streams_map_.clear(); nodes_list_ = {}; - size_t node_index = 0; - if (graph->subgraph_multi_call()) { - kernel_cnodes = graph->mem_reuse_exec_order(); - } else { - kernel_cnodes = graph->execution_order(); - } + auto &kernel_cnodes = (graph.subgraph_multi_call()) ? 
graph.mem_reuse_exec_order() : graph.execution_order(); for (size_t i = 0; i < kernel_cnodes.size(); i++) { auto kernel = kernel_cnodes[i]; MS_EXCEPTION_IF_NULL(kernel); SomasStreamPtr stream; - auto stream_id = AnfAlgo::GetStreamId(kernel); - auto it = find_if(streams_list_.begin(), streams_list_.end(), - [stream_id](const SomasStreamPtr &s) { return s->GetId() == stream_id; }); - if (it == streams_list_.end()) { + size_t stream_id = i; + if (depend_exec_order_) { + stream_id = AnfAlgo::GetStreamId(kernel); + } + auto it = streams_map_.find(stream_id); + if (it == streams_map_.end()) { stream = std::make_shared(stream_id); - streams_list_.push_back(stream); + streams_map_[stream_id] = stream; } else { - stream = *it; + stream = (*it).second; } // Node @@ -414,31 +624,22 @@ void Somas::InitSomasStreamAndNode(const session::KernelGraph *graph) { if (common::AnfAlgo::IsCommunicationOp(kernel)) { type = kCommunicationNode; } - auto node = std::make_shared(kernel->fullname_with_scope(), node_index, type, stream->GetId()); + auto node = std::make_shared(kernel->fullname_with_scope(), i, type, stream->GetId()); MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_CHECK_FAIL(nodes_list_.size() == i, "node_list_ size error!!!"); nodes_list_.push_back(node); stream->nodes_.push_back(node); auto key = kernel.get(); auto &nodes = nodes_map_[key]; nodes.push_back(node); - node_index++; - } - - // make nodes_id map - for (const auto &node : nodes_list_) { - if (nodes_id_map_.find(node->GetId()) != nodes_id_map_.end()) { - MS_LOG(EXCEPTION) << "Duplicate node id [" << node->GetId() << "]"; - } - nodes_id_map_[node->GetId()] = node; } } -void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph) { +void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph &graph) { MS_LOG(DEBUG) << "Somas InitSomasOutputAndWorkspaceTensors start..."; - MS_EXCEPTION_IF_NULL(graph); tensors_list_ = {}; size_t tensor_index = 0; - auto kernel_cnodes = 
graph->execution_order(); + auto &kernel_cnodes = graph.execution_order(); for (const auto &kernel : kernel_cnodes) { auto nodes = nodes_map_[kernel.get()]; auto node = nodes[0]; @@ -449,77 +650,75 @@ void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); auto output_sizes = kernel_mod->GetOutputSizeList(); - auto index = 0; for (const auto &size : output_sizes) { auto output_tensor_index = tensor_index; tensor_index++; - // Set all output tensor lifelong to true. - auto tensor = std::make_shared(output_tensor_index, node->GetId(), stream_id, size, kLifeLongNone); + size_t aligned_size = GetAlignSize(size); + if (aligned_size == 0) { + // Device Address still need to be allocated when output_size is 0 + aligned_size = GetAlignSize(kZeroAlignSize); + } + MS_LOG(INFO) << "Node " << kernel->fullname_with_scope() << " output size " << size << " align size " + << aligned_size; + auto tensor = + std::make_shared(output_tensor_index, node->GetId(), stream_id, size, aligned_size, kLifeLongNone); MS_EXCEPTION_IF_NULL(tensor); tensor->lifetime_.start_ = node->GetId(); tensor->lifetime_.end_ = (nodes.size() > 1) ? 
nodes.back()->GetId() : node->GetId(); tensor->type_ = kOutputOnly; - if (AnfAlgo::OutputAddrExist(kernel, IntToSize(index))) { - tensor->aligned_size_ = 0; - } + MS_EXCEPTION_IF_CHECK_FAIL(tensors_list_.size() == output_tensor_index, "tensors_list_ size error!!!"); tensors_list_.push_back(tensor); - tensors_map_[output_tensor_index] = tensor; std::for_each(nodes.begin(), nodes.end(), [tensor](auto &node) { MS_EXCEPTION_IF_NULL(node); - node->tensors_.insert(tensor); node->output_tensors_.push_back(tensor); }); - index++; } // WorkSpace Tensor auto workspace_sizes = kernel_mod->GetWorkspaceSizeList(); - index = 0; for (const auto &size : workspace_sizes) { auto workspace_tensor_index = tensor_index; tensor_index++; - SomasTensorPtr tensor = - std::make_shared(workspace_tensor_index, node->GetId(), stream_id, size, kLifeLongNone); + size_t aligned_size = GetAlignSize(size); + if (aligned_size == 0) { + // Device Address still need to be allocated when workspace_size is 0 + aligned_size = GetAlignSize(kZeroAlignSize); + } + SomasTensorPtr tensor = std::make_shared(workspace_tensor_index, node->GetId(), stream_id, size, + aligned_size, kLifeLongNone); MS_EXCEPTION_IF_NULL(tensor); tensor->type_ = kWorkspace; tensor->lifetime_.start_ = node->GetId(); tensor->lifetime_.end_ = (nodes.size() > 1) ? 
nodes.back()->GetId() : node->GetId(); - if (AnfAlgo::WorkspaceAddrExist(kernel, IntToSize(index))) { - tensor->aligned_size_ = 0; - } + + MS_EXCEPTION_IF_CHECK_FAIL(tensors_list_.size() == workspace_tensor_index, "tensors_list_ size error!!!"); tensors_list_.push_back(tensor); - tensors_map_[workspace_tensor_index] = tensor; std::for_each(nodes.begin(), nodes.end(), [tensor](auto &node) { MS_EXCEPTION_IF_NULL(node); - node->tensors_.insert(tensor); node->workspace_tensors_.push_back(tensor); }); - index++; } } } -void Somas::InitSomasInputTensors(const session::KernelGraph *graph) { +void Somas::InitSomasInputTensors(const session::KernelGraph &graph) { MS_LOG(DEBUG) << "Somas InitSomasInputTensors start..."; - MS_EXCEPTION_IF_NULL(graph); - bool is_all_nop_node = opt::IsAllNopNode(graph); static const auto enable_fusion_clear = (common::GetEnv("ENV_FUSION_CLEAR") == "1"); - auto kernel_cnodes = graph->execution_order(); + auto &kernel_cnodes = graph.execution_order(); for (const auto &kernel : kernel_cnodes) { if (common::AnfAlgo::GetCNodeName(kernel) != kAtomicAddrCleanOpName) { - InitCommonNodeInputs(is_all_nop_node, kernel); + InitCommonNodeInputs(kernel); } else { InitAtomicCleanInputs(enable_fusion_clear, kernel); } } } -void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { +void Somas::InitCommonNodeInputs(const CNodePtr &kernel) { auto nodes = nodes_map_[kernel.get()]; auto node = nodes[0]; MS_EXCEPTION_IF_NULL(node); - auto stream_id = node->GetStreamId(); // Input Tensor auto input_tensor_num = common::AnfAlgo::GetInputTensorNum(kernel); @@ -527,17 +726,12 @@ void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { for (size_t i = 0; i < input_tensor_num; i++) { auto input_node = kernel->input(i + 1); MS_EXCEPTION_IF_NULL(input_node); - session::KernelWithIndex prenode_index; - if (is_all_nop_node) { - prenode_index = common::AnfAlgo::VisitKernelWithReturnType(input_node, 0, false); - } else { - 
prenode_index = common::AnfAlgo::VisitKernelWithReturnType(input_node, 0, true); - } - if (common::AnfAlgo::CheckPrimitiveType(prenode_index.first, prim::kPrimMakeTuple)) { - MS_LOG(EXCEPTION) << "Input node [" << kernel->DebugString() << "]'s input " << i << " [" - << input_node->DebugString() << "] is MakeTuple"; - } + session::KernelWithIndex prenode_index = GetVisitKernelWithReturnType(input_node, 0); MS_EXCEPTION_IF_NULL(prenode_index.first); + if (common::AnfAlgo::CheckPrimitiveType(prenode_index.first, prim::kPrimMakeTuple)) { + MS_LOG(EXCEPTION) << "Node " << node->scope_full_name_ << "'s input node [" << input_node->DebugString() + << "]'s input " << i << " is MakeTuple"; + } if (!AnfUtils::IsRealCNodeKernel(prenode_index.first)) { auto op_name = common::AnfAlgo::GetCNodeName(kernel); TypeId input_origin_type = common::AnfAlgo::GetPrevNodeOutputInferDataType(kernel, i); @@ -556,7 +750,7 @@ void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { MS_LOG(EXCEPTION) << "Kernel[" << kernel->fullname_with_scope() << "]'s input " << i << " [" << prenode_index.first->fullname_with_scope() << "] is not init."; } - auto pre_somas_node = iter->second.at(0); + SomasNodePtr pre_somas_node = iter->second.at(0); if (prenode_index.second > pre_somas_node->output_tensors_.size()) { MS_LOG(EXCEPTION) << "Output index " << prenode_index.second << " exceed input node [" << prenode_index.first->fullname_with_scope() << "]'s outputs size " @@ -573,6 +767,7 @@ void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { for (auto &repeat_node : nodes) { input_somas_tensor->destination_nodes_.insert(repeat_node->GetId()); + input_somas_tensor->consumer_list_.emplace_back(repeat_node->GetId()); if (input_somas_tensor->lifetime_.end_ < repeat_node->GetId()) { input_somas_tensor->lifetime_.end_ = repeat_node->GetId(); } @@ -581,10 +776,6 @@ void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { if (node != 
pre_somas_node) { node->ancestor_nodes_.insert(pre_somas_node); } - auto input_tensor_stream_id = input_somas_tensor->GetSourceStreamId(); - if (input_tensor_stream_id != stream_id) { - input_somas_tensor->between_streams_ = true; - } } } @@ -614,7 +805,7 @@ void Somas::InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kern MS_EXCEPTION_IF_NULL(input_somas_tensor); node->input_tensors_.push_back(input_somas_tensor); if (enable_fusion_clear) { - input_somas_tensor->lifelong_value_ = kLifeLongGraphAll; + input_somas_tensor->lifelong_value_ = kLifeLongGraphStart; MS_LOG(INFO) << "Set " << node->scope_full_name_ << "'s Input node " << pre_somas_node->scope_full_name_ << " 's output" << index << " to lifelong"; } @@ -633,7 +824,7 @@ void Somas::InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kern MS_EXCEPTION_IF_NULL(input_somas_tensor); node->input_tensors_.push_back(input_somas_tensor); if (enable_fusion_clear) { - input_somas_tensor->lifelong_value_ = kLifeLongGraphAll; + input_somas_tensor->lifelong_value_ = kLifeLongGraphStart; MS_LOG(INFO) << "Set " << node->scope_full_name_ << "'s Input node " << pre_somas_node->scope_full_name_ << " 's workspace" << index << " to lifelong"; } @@ -642,47 +833,6 @@ void Somas::InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kern } } -void Somas::InitSomasEventInfos() { - MS_LOG(DEBUG) << "Somas InitSomasEventInfos start..."; - event_map_ = {}; - std::map send_recv_map; -#ifdef ENABLE_D - send_recv_map = device::ascend::AscendStreamAssign::GetInstance().get_event_map(); -#endif - for (const auto &send_recv : send_recv_map) { - size_t event_id = common::AnfAlgo::GetNodeAttr(send_recv.first, kAttrEventId); - event_map_[event_id] = std::make_pair(send_recv.first, send_recv.second); - } - - auto tensor_index = tensors_list_.size(); - for (const auto &event : event_map_) { - std::pair send_recv_pair = event.second; - auto send_iter = nodes_map_.find(send_recv_pair.first.get()); - auto 
recv_iter = nodes_map_.find(send_recv_pair.second.get()); - if (send_iter == nodes_map_.end() || recv_iter == nodes_map_.end()) { - continue; - } - - auto &somas_send = send_iter->second.at(0); - auto &somas_recv = recv_iter->second.at(0); - auto output_tensor_index = tensor_index; - tensor_index++; - SomasTensorPtr tensor = std::make_shared(output_tensor_index, somas_send->GetId(), - somas_send->GetStreamId(), 0, kLifeLongNone); - tensor->lifetime_.start_ = somas_send->GetId(); - tensor->lifetime_.end_ = somas_recv->GetId(); - tensor->type_ = kEventVirtualOutput; - tensor->destination_nodes_.insert(somas_recv->GetId()); - somas_send->tensors_.insert(tensor); - somas_send->output_tensors_.push_back(tensor); - somas_recv->input_tensors_.push_back(tensor); - somas_recv->ancestor_nodes_.insert(somas_send); - tensors_list_.push_back(tensor); - tensors_map_[output_tensor_index] = tensor; - } - MS_LOG(DEBUG) << "Somas InitSomasEventInfos end."; -} - SomasParameterPtr Somas::CreateSomasParameter(const AnfNodePtr &node, size_t index) { MS_EXCEPTION_IF_NULL(node); auto id = parameters_list_.size(); @@ -722,95 +872,23 @@ SomasParameterPtr Somas::GetSomasParameter(const AnfNodePtr &node, size_t index) } } -void Somas::InitBasicInfo(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); -#ifdef ENABLE_D - streams_groups_ = device::ascend::AscendStreamAssign::GetInstance().get_stream_group(); -#endif +bool Somas::InitBasicInfoFromGraph(const session::KernelGraph &graph) { InitSomasStreamAndNode(graph); InitSomasOutputAndWorkspaceTensors(graph); InitSomasInputTensors(graph); - InitSomasEventInfos(); - - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - -#ifdef ENABLE_DUMP_IR - SubModuleId module = SubModuleId::SM_OPTIMIZER; - std::string name = "somas_initial_info." 
+ std::to_string(graph->graph_id()); - (void)mindspore::RDR::RecordString(module, name, SomasInfo()); -#endif - - save_graphs_ = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); - save_graphs_path_ = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); - if (save_graphs_path_.empty()) { - save_graphs_path_ = "."; - } - if (save_graphs_) { - std::string file_path = - GetSaveGraphsPathName("/somas_initial_info_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_); - DumpSomasInfoIR(file_path); - } -} - -void Somas::GetNextOutputProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto kernel_cnodes = graph->execution_order(); - size_t total_size = 0; - for (const auto &kernel : kernel_cnodes) { - if (common::AnfAlgo::GetCNodeName(kernel) != kGetNextOpName) { - continue; - } - auto iter = nodes_map_.find(kernel.get()); - if (iter != nodes_map_.end()) { - auto &node = iter->second.at(0); - MS_EXCEPTION_IF_NULL(node); - auto getnext_output_tensors = node->output_tensors_; - for (auto &tensor : getnext_output_tensors) { - MS_EXCEPTION_IF_NULL(tensor); - total_size += tensor->GetAlignedSize(); - tensor->lifelong_value_ = kLifeLongGraphAll; - tensor->type_ = kGetNextOutput; - } - } - } - MS_LOG(INFO) << "Special Tensor total size: GetNext Output " << total_size; -} - -void Somas::IndependentNodeOutputProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto kernel_cnodes = graph->execution_order(); - size_t total_size = 0; - for (const auto &kernel : kernel_cnodes) { - bool independent = AnfAlgo::IsIndependentNode(kernel); - if (!independent) { - continue; - } - auto iter = nodes_map_.find(kernel.get()); - if (iter != nodes_map_.end()) { - auto &node = iter->second.at(0); - MS_EXCEPTION_IF_NULL(node); - auto semi_reuse_output_tensors = node->output_tensors_; - for (auto &tensor : semi_reuse_output_tensors) { - MS_EXCEPTION_IF_NULL(tensor); - total_size += tensor->GetAlignedSize(); - tensor->lifelong_value_ = 
kLifeLongGraphEnd; - } - } - } - - MS_LOG(INFO) << "Special Tensor total size: Independent Node output " << total_size; + InitControlTensors(graph); + GraphOutputProcess(graph); + return true; } #ifndef ENABLE_SECURITY -void Somas::SummaryInputProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - bool summary_exist = graph->summary_node_exist(); +void Somas::SummaryInputProcess(const session::KernelGraph &graph) { + bool summary_exist = graph.summary_node_exist(); if (!summary_exist) { return; } - auto summary_nodes = graph->summary_nodes(); + auto summary_nodes = graph.summary_nodes(); if (summary_nodes.empty()) { return; } @@ -819,7 +897,7 @@ void Somas::SummaryInputProcess(const session::KernelGraph *graph) { for (const auto &node_item : summary_nodes) { auto origin_node = node_item.second.first; size_t origin_index = IntToSize(node_item.second.second); - auto item_with_index = common::AnfAlgo::VisitKernelWithReturnType(origin_node, origin_index, true); + auto item_with_index = GetVisitKernelWithReturnType(origin_node, origin_index); auto node = item_with_index.first; size_t index = item_with_index.second; auto iter = nodes_map_.find(node.get()); @@ -829,7 +907,7 @@ void Somas::SummaryInputProcess(const session::KernelGraph *graph) { if (index < input_node->output_tensors_.size()) { auto tensor = input_node->output_tensors_[index]; MS_EXCEPTION_IF_NULL(tensor); - tensor->lifelong_value_ = kLifeLongGraphAll; + tensor->lifelong_value_ = kLifeLongGraphEnd; tensor->type_ = kSummaryInput; total_summary_size += tensor->GetAlignedSize(); MS_LOG(INFO) << "Set summary node input tensor's lifelong, node: " << node->fullname_with_scope() @@ -847,9 +925,50 @@ void Somas::SummaryInputProcess(const session::KernelGraph *graph) { } #endif -void Somas::RefNodeProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto kernel_cnodes = graph->execution_order(); +void Somas::GraphOutputProcess(const session::KernelGraph &graph) { + 
size_t count = 0; + auto outputs = common::AnfAlgo::GetAllOutputWithIndex(graph.output()); + for (auto output_with_index : outputs) { + auto output_kernel = output_with_index.first; + MS_EXCEPTION_IF_NULL(output_kernel); + if (AnfUtils::IsRealCNodeKernel(output_kernel) && nodes_map_.find(output_kernel.get()) == nodes_map_.end()) { + auto cnode = output_kernel->cast(); + if (!common::AnfAlgo::IsNopNode(cnode)) { + MS_LOG(EXCEPTION) << "Node[" << cnode->fullname_with_scope() + << "] doesn't exist in nodes_map and is not a nop node!!!"; + } + output_with_index = common::AnfAlgo::VisitKernelWithReturnType(cnode->input(kNopNodeRealInputIndex), 0, false); + output_kernel = output_with_index.first; + } + + if (!AnfUtils::IsRealCNodeKernel(output_kernel)) { + continue; + } + + auto output_index = output_with_index.second; + auto iter = nodes_map_.find(output_kernel.get()); + if (iter != nodes_map_.end()) { + auto &node = iter->second.at(0); + MS_EXCEPTION_IF_NULL(node); + if (output_index <= node->output_tensors_.size()) { + auto &tensor = node->output_tensors_[output_index]; + tensor->aligned_size_ = 0; + tensor->type_ = kGraphOutput; + count++; + } else { + MS_LOG(EXCEPTION) << "Graph's output node " << output_kernel->fullname_with_scope() << "'s output index" + << output_index << " is larger than its output tensor number " + << node->output_tensors_.size(); + } + } else { + MS_LOG(EXCEPTION) << "Can't find somas node for graph output node " << output_kernel->fullname_with_scope(); + } + } + MS_LOG(INFO) << "Set " << count << " graph output tensors' aligned size to 0."; +} + +void Somas::RefNodeProcess(const session::KernelGraph &graph) { + auto &kernel_cnodes = graph.execution_order(); size_t total_output_size = 0; size_t total_input_size = 0; for (const auto &kernel : kernel_cnodes) { @@ -864,17 +983,30 @@ void Somas::RefNodeProcess(const session::KernelGraph *graph) { auto out_index = output_index; output_index++; session::AnfWithOutIndex out_pair(kernel, out_index); 
- if (graph->IsInRefOutputMap(out_pair)) { - auto origin_pair = graph->GetRefCorrespondOutput(out_pair); + if (graph.IsInRefOutputMap(out_pair)) { + auto origin_pair = graph.GetRefCorrespondOutput(out_pair); MS_EXCEPTION_IF_NULL(origin_pair.first); auto &node = nodes_map_[kernel.get()].at(0); MS_EXCEPTION_IF_NULL(node); auto output_tensor = node->output_tensors_[out_index]; MS_EXCEPTION_IF_NULL(output_tensor); - output_tensor->type_ = kRefNodeOutput; + output_tensor->type_ = kUnion; total_output_size += size; if (AnfUtils::IsRealCNodeKernel(origin_pair.first)) { + if (nodes_map_.find(origin_pair.first.get()) == nodes_map_.end()) { + auto cnode = origin_pair.first->cast(); + if (!common::AnfAlgo::IsNopNode(cnode)) { + MS_LOG(EXCEPTION) << "Node[" << origin_pair.first->fullname_with_scope() << "] find input node[" + << cnode->fullname_with_scope() + << "] doesn't exist in nodes_map and is not a nop node!!!!"; + } + origin_pair = common::AnfAlgo::VisitKernelWithReturnType(cnode->input(kNopNodeRealInputIndex), 0, false); + } + if (!origin_pair.first->isa()) { + MS_LOG(EXCEPTION) << "The origin_pair.first is not a cnode. 
Info origin_pair.first: " + << origin_pair.first->DebugString(); + } auto ori_node = origin_pair.first->cast(); auto ori_index = origin_pair.second; if (nodes_map_.find(ori_node.get()) == nodes_map_.end()) { @@ -886,13 +1018,16 @@ void Somas::RefNodeProcess(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(repeat_node); auto input_tensor = repeat_node->output_tensors_[ori_index]; MS_EXCEPTION_IF_NULL(input_tensor); - input_tensor->type_ = kRefNodeInput; + input_tensor->type_ = kUnion; total_input_size += input_tensor->aligned_size_; std::vector refnode_input_output; refnode_input_output.push_back(input_tensor->GetId()); refnode_input_output.push_back(output_tensor->GetId()); - ref_node_constraints_.push_back(refnode_input_output); + union_tensors_list_.push_back(refnode_input_output); MS_LOG(INFO) << "RefNode: input " << input_tensor->GetId() << " output " << output_tensor->GetId(); + } else { + output_tensor->type_ = kGraphInput; + output_tensor->aligned_size_ = 0; } } } @@ -901,66 +1036,60 @@ void Somas::RefNodeProcess(const session::KernelGraph *graph) { MS_LOG(INFO) << "Special Tensor total size: RefNode: input " << total_input_size << " output " << total_output_size; } -void Somas::NonTaskSplitProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto kernel_cnodes = graph->execution_order(); - for (const auto &kernel : kernel_cnodes) { - auto op_name = common::AnfAlgo::GetCNodeName(kernel); - if (common::AnfAlgo::IsNonTaskOp(kernel)) { - std::vector refnode_input_output; - auto node = nodes_map_[kernel.get()].at(0); - MS_EXCEPTION_IF_NULL(node); - if (node->input_tensors_.size() == 0) { - MS_LOG(EXCEPTION) << op_name << " has no input tensor, can not do split non_task process."; - } - auto input_tensor = node->input_tensors_[0]; - MS_EXCEPTION_IF_NULL(input_tensor); - input_tensor->type_ = kRefNodeInput; - refnode_input_output.push_back(input_tensor->GetId()); - - for (auto &output_tensor : node->output_tensors_) { - 
MS_EXCEPTION_IF_NULL(output_tensor); - output_tensor->type_ = kRefNodeOutput; - refnode_input_output.push_back(output_tensor->GetId()); - } - ref_node_constraints_.push_back(refnode_input_output); - } +void Somas::UnReuseNodeProcess(const session::KernelGraph &graph) { + std::map full_name_type = {}; + for (const auto &node : un_reuse_node_name_) { + full_name_type.insert(node); } -} -void Somas::UnReuseNodeProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - vector full_name_list = {}; - if (full_name_list.size() == 0) { + auto &kernel_cnodes = graph.execution_order(); + for (const auto &kernel : kernel_cnodes) { + auto type = common::AnfAlgo::GetCNodeName(kernel); + auto iter = un_reuse_node_type_.find(type); + if (iter == un_reuse_node_type_.end()) { + continue; + } + auto full_name = kernel->fullname_with_scope(); + full_name_type[full_name] = iter->second; + } + + if (full_name_type.empty()) { return; } - auto kernel_cnodes = graph->execution_order(); for (const auto &kernel : kernel_cnodes) { MS_EXCEPTION_IF_NULL(kernel); auto full_name = kernel->fullname_with_scope(); - auto iter = std::find(full_name_list.begin(), full_name_list.end(), full_name); - if (iter != full_name_list.end()) { - MS_LOG(INFO) << "Set UnReuse Node in somas, Node:" << full_name; - auto key = kernel.get(); - auto somas_node = nodes_map_[key].at(0); - MS_EXCEPTION_IF_NULL(somas_node); - // input + auto iter = full_name_type.find(full_name); + if (iter == full_name_type.end()) { + continue; + } + auto un_reuse_type = iter->second; + MS_LOG(INFO) << "Set UnReuse Node in somas, Node:" << iter->first << ", UnReuse type:" << un_reuse_type; + auto key = kernel.get(); + auto somas_node = nodes_map_[key].at(0); + MS_EXCEPTION_IF_NULL(somas_node); + // input + if (un_reuse_type == UnReuseType::kUnReuseAll || un_reuse_type == UnReuseType::kUnReuseInput) { auto inputs = somas_node->input_tensors_; for (auto &input : inputs) { MS_EXCEPTION_IF_NULL(input); 
input->lifelong_value_ = kLifeLongGraphAll; } + } - // output + // output + if (un_reuse_type == UnReuseType::kUnReuseAll || un_reuse_type == UnReuseType::kUnReuseOutput) { auto outputs = somas_node->output_tensors_; MS_LOG(INFO) << "Output size of " << kernel->fullname_with_scope() << " is " << outputs.size(); for (auto &output : outputs) { MS_EXCEPTION_IF_NULL(output); output->lifelong_value_ = kLifeLongGraphAll; } + } - // workspace + // workspace + if (un_reuse_type == UnReuseType::kUnReuseAll || un_reuse_type == UnReuseType::kUnReuseWorkspace) { auto workspaces = somas_node->workspace_tensors_; for (auto &workspace : workspaces) { MS_EXCEPTION_IF_NULL(workspace); @@ -970,8 +1099,7 @@ void Somas::UnReuseNodeProcess(const session::KernelGraph *graph) { } } -void Somas::GenContiguousList(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); +void Somas::CommunicationNodeProcess(const session::KernelGraph &graph) { for (const auto &node : nodes_list_) { MS_EXCEPTION_IF_NULL(node); if (node->GetType() != kCommunicationNode) { @@ -980,12 +1108,14 @@ void Somas::GenContiguousList(const session::KernelGraph *graph) { // Contiguous input if ((!node->input_tensors_.empty()) && (!node->input_tensors_[0]->contiguous_)) { + // add gap for first and last input if (node->input_tensors_[0]->aligned_size_ != 0) { - node->input_tensors_[0]->aligned_size_ += kGapSize; + node->input_tensors_[0]->aligned_size_ += communication_gap_size_; } if (node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_ != 0) { - node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_ += kGapSize; + node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_ += communication_gap_size_; } + std::vector inputs; for (const auto &input_tensor : node->input_tensors_) { MS_EXCEPTION_IF_NULL(input_tensor); @@ -1002,12 +1132,14 @@ void Somas::GenContiguousList(const session::KernelGraph *graph) { // Contiguous output if ((!node->output_tensors_.empty()) && 
(!node->output_tensors_[0]->contiguous_)) { + // add gap for first and last output if (node->output_tensors_[0]->aligned_size_ != 0) { - node->output_tensors_[0]->aligned_size_ += kGapSize; + node->output_tensors_[0]->aligned_size_ += communication_gap_size_; } if (node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_ != 0) { - node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_ += kGapSize; + node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_ += communication_gap_size_; } + std::vector outputs; for (const auto &output_tensor : node->output_tensors_) { MS_EXCEPTION_IF_NULL(output_tensor); @@ -1021,9 +1153,22 @@ void Somas::GenContiguousList(const session::KernelGraph *graph) { } contiguous_tensors_list_.push_back(outputs); } + + // check the tensors of the list + std::set all_contiguous_tensors_set; + size_t all_contiguous_tensors_num = 0; + for (auto &tensors : contiguous_tensors_list_) { + all_contiguous_tensors_num += tensors.size(); + all_contiguous_tensors_set.insert(tensors.begin(), tensors.end()); + } + if (all_contiguous_tensors_num != all_contiguous_tensors_set.size()) { + MS_LOG(EXCEPTION) << "Please check the CommunicationNodes, some tensor are in multiple contiguous list"; + } } } +bool Somas::NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2) { return node1->GetId() < node2->GetId(); } + void Somas::BuildConflictInfo(const std::shared_ptr &tensor, TensorConflictInfo *tensor_conflict_info, std::vector *destination_node_list) { const auto &consumer_list = tensor->consumer_list_; @@ -1044,12 +1189,7 @@ void Somas::BuildConflictInfo(const std::shared_ptr &tensor, Tensor } } -void Somas::ComputeConflictPairs() { - if (tensors_list_.empty()) { - MS_LOG(INFO) << "No Tensor for Conflict computing"; - return; - } - +void Somas::ComputeBasicMatrix() { MS_LOG(INFO) << "Start Conflict Computing (Bitset Model)"; auto start_conflict = std::chrono::system_clock::now(); std::sort(nodes_list_.begin(), 
nodes_list_.end(), NodeSort); @@ -1127,11 +1267,8 @@ void Somas::ComputeConflictPairs() { common::ThreadPool::GetInstance().SyncRun(tasks); } - ProcessSemiLifeLongTensor(); - - MS_LOG(INFO) << "End Tensor Relation Computing"; auto end_conflict = std::chrono::system_clock::now(); - MS_LOG(INFO) << "End Conflict Computing (Bitset Model)(time taken " + MS_LOG(INFO) << "End Basic Conflict Computing (Bitset Model)(time taken " << std::chrono::duration_cast(end_conflict - start_conflict).count() << "ms)"; } @@ -1146,8 +1283,13 @@ void Somas::ProcessSemiLifeLongTensor() { if (calc_tensor == target_tensor) { continue; } - if ((calc_tensor->IsSemiLifelongStart() && target_tensor->GetId() < calc_tensor->GetId()) || - (calc_tensor->IsSemiLifelongEnd() && target_tensor->GetId() > calc_tensor->GetId())) { + if (depend_exec_order_) { + if ((calc_tensor->IsSemiLifelongStart() && target_tensor->GetId() < calc_tensor->GetId()) || + (calc_tensor->IsSemiLifelongEnd() && target_tensor->GetId() > calc_tensor->GetId())) { + reuse_matrix_[calc_tensor->GetId()].SetBitFalse(target_tensor->GetId()); + reuse_matrix_[target_tensor->GetId()].SetBitFalse(calc_tensor->GetId()); + } + } else { reuse_matrix_[calc_tensor->GetId()].SetBitFalse(target_tensor->GetId()); reuse_matrix_[target_tensor->GetId()].SetBitFalse(calc_tensor->GetId()); } @@ -1155,65 +1297,62 @@ void Somas::ProcessSemiLifeLongTensor() { } } +void Somas::ComputeConflictMatrix() { + if (tensors_list_.empty()) { + MS_LOG(INFO) << "No Tensor for Conflict computing"; + return; + } + ComputeBasicMatrix(); + ProcessSemiLifeLongTensor(); + UpdateUnionTensorsConflict(); +} + +void Somas::UpdateContiguousTensorList() { + processed_contiguous_tensors_list_.clear(); + processed_contiguous_tensors_list_.insert(processed_contiguous_tensors_list_.end(), contiguous_tensors_list_.begin(), + contiguous_tensors_list_.end()); + std::set> contiguous_tensors_list_to_remove; + + GetContiguousListContainUnionTensor(); + for (const auto &ref_list_pair : 
contiguous_list_with_ref_index_map_) { + contiguous_tensors_list_to_remove.insert(contiguous_tensors_list_[ref_list_pair.second]); + } + + // remove the contiguous list which all tensors' align size is 0 + for (const auto &contiguous_list : contiguous_tensors_list_) { + bool all_outputs = true; + for (auto tensor_id : contiguous_list) { + auto tensor = tensors_list_[tensor_id]; + MS_EXCEPTION_IF_NULL(tensor); + if (tensor->aligned_size_ != 0) { + all_outputs = false; + break; + } + } + + if (all_outputs) { + contiguous_tensors_list_to_remove.insert(contiguous_list); + } + } + + for (const auto &contiguous_list : contiguous_tensors_list_to_remove) { + auto iterator = + std::find(processed_contiguous_tensors_list_.begin(), processed_contiguous_tensors_list_.end(), contiguous_list); + if (iterator != processed_contiguous_tensors_list_.end()) { + processed_contiguous_tensors_list_.erase(iterator); + } else { + MS_LOG(WARNING) << "Could not find contiguous list to remove for ref"; + } + } +} + void Somas::UpdateTensorDestinations() { - // Loop to add edges within each stream (node order within stream) - for (const auto &stream : streams_list_) { - MS_EXCEPTION_IF_NULL(stream); - auto &nodes = stream->nodes_; - std::sort(nodes.begin(), nodes.end(), NodeSort); - for (size_t i = 1; i < nodes.size(); i++) { - const auto &previous_node = nodes[i - 1]; - const auto ¤t_node = nodes[i]; - MS_EXCEPTION_IF_NULL(current_node); - current_node->ancestor_nodes_.insert(previous_node); - } - } - - // Loop to add edges from end to beginning of next group - for (const auto &group : streams_groups_) { - for (size_t i = 1; i < group.size(); i++) { - size_t previous_stream = group[i - 1]; - size_t current_stream = group[i]; - - auto stream = GetSomasStream(previous_stream); - if (stream == nullptr) { - continue; - } - - auto &last_node_in_prev_stream = stream->nodes_.back(); - - stream = GetSomasStream(current_stream); - if (stream == nullptr) { - continue; - } - auto 
&first_node_in_cur_stream = stream->nodes_.front(); - - first_node_in_cur_stream->ancestor_nodes_.insert(last_node_in_prev_stream); - } - } - // Loop to avoid tensors with empty destinations (add itself) for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); - if (tensor->destination_nodes_.size() == 0) { + if (tensor->destination_nodes_.empty()) { tensor->destination_nodes_.insert(tensor->GetSourceNodeId()); - } - } - - mindspore::HashMap stream_max_destination_node; - // Loop to compute max destinations in each stream - for (const auto &tensor : tensors_list_) { - MS_EXCEPTION_IF_NULL(tensor); - stream_max_destination_node.clear(); - for (const auto &node_id : tensor->destination_nodes_) { - auto node = GetSomasNode(node_id); - MS_EXCEPTION_IF_NULL(node); - if (node_id > stream_max_destination_node[node->GetStreamId()]) { - stream_max_destination_node[node->GetStreamId()] = node_id; - } - } - for (const auto &dst_map : stream_max_destination_node) { - tensor->consumer_list_.emplace_back(dst_map.second); + tensor->consumer_list_.emplace_back(tensor->GetSourceNodeId()); } } } @@ -1270,7 +1409,7 @@ void Somas::ComputeOneTensorConflicts(const std::shared_ptr &target const std::vector &tensor_conflict_info_list, const std::vector &destination_node_list, const vector &nodes_dependency, - std::vector *tensor_relation) const { + std::vector *tensor_relation) { MS_EXCEPTION_IF_NULL(target_tensor); auto target_tensor_id = target_tensor->GetId(); auto target_src_node_id = target_tensor->GetSourceNodeId(); @@ -1281,86 +1420,41 @@ void Somas::ComputeOneTensorConflicts(const std::shared_ptr &target // the conflict info of per calc_tensor for (const auto &tensor_conflict_info : tensor_conflict_info_list) { - if (tensor_conflict_info.tensor_id_ == target_tensor_id || - tensor_conflict_info.src_node_id_ == target_src_node_id) { + if (tensor_conflict_info.tensor_id == target_tensor_id || tensor_conflict_info.src_node_id == target_src_node_id) { continue; } if 
(CheckIsDependency(tensor_conflict_info, target_src_node_id, nodes_dependency, destination_node_list) || - CheckIsDependency(target_info, tensor_conflict_info.src_node_id_, nodes_dependency, + CheckIsDependency(target_info, tensor_conflict_info.src_node_id, nodes_dependency, target_destination_node_list)) { // calc_tensor and target_tensor have dependencies so they can reuse each other - (*tensor_relation)[target_tensor_id].SetBitTrue(tensor_conflict_info.tensor_id_); + (*tensor_relation)[target_tensor_id].SetBitTrue(tensor_conflict_info.tensor_id); } } } -bool Somas::NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2) { return node1->GetId() < node2->GetId(); } - -bool Somas::Assign(const session::KernelGraph *graph) { - MS_LOG(DEBUG) << "Somas Assign start..."; +bool Somas::Solve(const session::KernelGraph &graph) { + MS_LOG(INFO) << "Somas Assign start..."; if (tensors_list_.empty()) { MS_LOG(INFO) << "No Tensor for Assigner"; return true; } - // Ref Node Preprocessing - UpdateRefTensorsConflict(); - std::map contiguous_list_with_ref_index_map = GetContiguousListContainRefTensor(); - vector> contiguous_tensors_list_removed = contiguous_tensors_list_; - std::set> contiguous_tensors_list_to_remove; - for (const auto &ref_list_pair : contiguous_list_with_ref_index_map) { - contiguous_tensors_list_to_remove.insert(contiguous_tensors_list_[ref_list_pair.second]); - } - - // remove the contiguous list which all tensors' align size is 0 - for (auto contiguous_list : contiguous_tensors_list_) { - bool all_outputs = true; - for (auto tensor_id : contiguous_list) { - auto tensor = tensors_list_[tensor_id]; - MS_EXCEPTION_IF_NULL(tensor); - if (tensor->aligned_size_ != 0) { - all_outputs = false; - break; - } - } - - if (all_outputs) { - contiguous_tensors_list_to_remove.insert(contiguous_list); - } - } - - for (const auto &contiguous_list : contiguous_tensors_list_to_remove) { - auto iterator = - std::find(contiguous_tensors_list_removed.begin(), 
contiguous_tensors_list_removed.end(), contiguous_list); - if (iterator != contiguous_tensors_list_removed.end()) { - contiguous_tensors_list_removed.erase(iterator); - } else { - MS_LOG(WARNING) << "Could not find contiguous list to remove for ref"; - } - } - MS_LOG(INFO) << "End Solving Preprocessing for Ref Node"; - UpdateRefOverlapTensorsConflicts(); - -#ifdef SOMAS_DEBUG - // Compute number of constraints for each tensor + // Compute number of constraints for each tensor which will used in solver auto tensors_num = tensors_list_.size(); - for (auto tensor1 : tensors_list_) { - auto ones_num = reuse_matrix_[tensor1->GetId()].CountOnesNum(); - tensor1->num_constraints_ = tensors_num - ones_num; + for (const auto &tensor : tensors_list_) { + auto ones_num = reuse_matrix_[tensor->GetId()].CountOnesNum(); + tensor->num_constraints_ = tensors_num - ones_num; } -#endif // Prepare solver info - MS_LOG(INFO) << "Start Loop to create solver info"; - for (auto tensor : tensors_list_) { + for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); if (tensor->GetSolverTensorDesc() != nullptr) { SomasSolverTensorDescPtr pSolverTensor = tensor->GetSolverTensorDesc(); (void)solver_tensor_desc_map_.emplace(pSolverTensor->index_, pSolverTensor); } } - MS_LOG(INFO) << "End Loop to create solver info"; MS_LOG(INFO) << "Start Solving"; if (solver_tensor_desc_map_.empty()) { @@ -1370,7 +1464,7 @@ bool Somas::Assign(const session::KernelGraph *graph) { somas_solver_ = std::make_shared(); auto status = - somas_solver_->Solving(graph, &solver_tensor_desc_map_, &reuse_matrix_, contiguous_tensors_list_removed, false); + somas_solver_->Solving(graph, &solver_tensor_desc_map_, &reuse_matrix_, processed_contiguous_tensors_list_, false); MS_LOG(INFO) << "End Solving"; if (status != SUCCESS) { GenGraphStatisticInfo(); @@ -1383,18 +1477,18 @@ bool Somas::Assign(const session::KernelGraph *graph) { tensor->SetOffset(); } - UpdateRefTensorsOffset(); - 
UpdateContiguousTensorsOffset(contiguous_list_with_ref_index_map); + UpdateUnionTensorsOffset(); + UpdateContiguousTensorsOffset(contiguous_list_with_ref_index_map_); - // Set mem_offset_ value by solver result - mem_offset_ = static_cast(somas_solver_->GetMaxOffset()); - MS_LOG(DEBUG) << "Somas Assign end."; + reused_memory_size_ = static_cast(somas_solver_->GetMaxOffset()); + + MS_LOG(INFO) << "Somas Assign end."; return true; } -std::map Somas::GetContiguousListContainRefTensor() { - // key: contiguous list index with ref node input; value: contiguous list index with ref node output - std::map contiguous_list_with_ref_index_map; +void Somas::GetContiguousListContainUnionTensor() { + // key: contiguous list index with first union tensor; value: contiguous list index with other union tensor + contiguous_list_with_ref_index_map_.clear(); std::map ref_tensors_in_contiguous_map = GetRefTensorsInContiguousList(); std::map>> contiguous_ref_list_error_check_map; for (const auto &ref_pair : ref_tensors_in_contiguous_map) { @@ -1433,9 +1527,9 @@ std::map Somas::GetContiguousListContainRefTensor() { if (!found_second) { MS_LOG(WARNING) << "Contiguous ref tensor " << ref_second << " not found in any contiguous list"; } - if (contiguous_list_with_ref_index_map.find(index_first) == contiguous_list_with_ref_index_map.end() || - contiguous_list_with_ref_index_map[index_first] == index_second) { - contiguous_list_with_ref_index_map[index_first] = index_second; + if (contiguous_list_with_ref_index_map_.find(index_first) == contiguous_list_with_ref_index_map_.end() || + contiguous_list_with_ref_index_map_[index_first] == index_second) { + contiguous_list_with_ref_index_map_[index_first] = index_second; // Checking for error cases if (index_in_list_first != index_in_list_second) { MS_LOG(WARNING) << "Inconsistency in contiguous ref: tensor " << ref_first << " in position " @@ -1445,7 +1539,7 @@ std::map Somas::GetContiguousListContainRefTensor() { 
contiguous_ref_list_error_check_map[index_first][index_second].insert(index_in_list_first); } else { MS_LOG(WARNING) << "Contiguous list " << index_first << " associated (ref node) with two other contiguous lists: " - << contiguous_list_with_ref_index_map[index_first] << " and " << index_second; + << contiguous_list_with_ref_index_map_[index_first] << " and " << index_second; } } @@ -1466,24 +1560,23 @@ std::map Somas::GetContiguousListContainRefTensor() { } } } - return contiguous_list_with_ref_index_map; } std::map Somas::GetRefTensorsInContiguousList() { // key: refnode input value: refnode output std::map ref_tensors_in_contiguous_map; - for (auto ref_node_list : ref_node_constraints_) { + for (auto ref_node_list : union_tensors_list_) { // Count contiguous tensors in ref list auto contiguous_in_ref_list = std::count_if(ref_node_list.begin(), ref_node_list.end(), - [this](size_t tid) { return tensors_map_[tid]->contiguous_; }); + [this](size_t tid) { return tensors_list_[tid]->contiguous_; }); // Keep info about contiguous and check for errors if (ref_node_list.size() > kRefNodeTensorNum && contiguous_in_ref_list > 0) { MS_LOG(WARNING) << "Ref node of size greater than two with at least one contiguous tensor in"; } if (ref_node_list.size() == kRefNodeTensorNum && contiguous_in_ref_list == 1) { MS_LOG(WARNING) << "Ref node of size two with only one contiguous tensor" << ref_node_list[0] << ":" - << tensors_map_[ref_node_list[0]]->contiguous_ << ", " << ref_node_list[1] << ":" - << tensors_map_[ref_node_list[1]]->contiguous_; + << tensors_list_[ref_node_list[0]]->contiguous_ << ", " << ref_node_list[1] << ":" + << tensors_list_[ref_node_list[1]]->contiguous_; } if (ref_node_list.size() == kRefNodeTensorNum && LongToSize(contiguous_in_ref_list) == kRefNodeTensorNum) { ref_tensors_in_contiguous_map[ref_node_list[0]] = ref_node_list[1]; @@ -1498,54 +1591,36 @@ void Somas::UpdateContiguousTensorsOffset(const std::map &contig size_t index_first = ref_list_pair.first; 
size_t index_second = ref_list_pair.second; for (size_t x = 0; x < contiguous_tensors_list_[index_second].size(); x++) { - tensors_map_[contiguous_tensors_list_[index_second][x]]->offset_ = - tensors_map_[contiguous_tensors_list_[index_first][x]]->offset_; + tensors_list_[contiguous_tensors_list_[index_second][x]]->offset_ = + tensors_list_[contiguous_tensors_list_[index_first][x]]->offset_; + tensors_list_[contiguous_tensors_list_[index_second][x]]->aligned_size_ = + tensors_list_[contiguous_tensors_list_[index_first][x]]->aligned_size_; } } // Contiguous gaps postprocessing for (auto list : contiguous_tensors_list_) { - tensors_map_[list[0]]->offset_ += kGapSize; + tensors_list_[list[0]]->offset_ += communication_gap_size_; } } -void Somas::UpdateRefTensorsOffset() { - // Ref Node Postprocessing - MS_LOG(INFO) << "\nStart Solving Postprocessing for Ref Node"; +void Somas::UpdateUnionTensorsOffset() { // Set offset for rest of ref node list (ignored by solver due to ref node preprocessing) - for (auto ref_node_list : ref_node_constraints_) { + for (auto ref_node_list : union_tensors_list_) { for (size_t i = 1; i < ref_node_list.size(); ++i) { - tensors_map_[ref_node_list[i]]->offset_ = tensors_map_[ref_node_list[0]]->offset_; + tensors_list_[ref_node_list[i]]->offset_ = tensors_list_[ref_node_list[0]]->offset_; + tensors_list_[ref_node_list[i]]->aligned_size_ = tensors_list_[ref_node_list[0]]->aligned_size_; } } } -void Somas::UpdateRefOverlapTensorsConflicts() { - // Ref Overlap Preprocessing - MS_LOG(INFO) << "Start Solving Preprocessing for Ref Overlap"; - // In ConflictComputing(), by use of ref_overlap_ flag, each tensor in a ref_overlap_list has all entries 1 in - // cannot_reuse_ array Here, we allow reuse only among tensors in same list - for (auto ref_overlap_list : ref_overlap_constraints_) { - for (size_t tid_1 : ref_overlap_list) { - for (size_t tid_2 : ref_overlap_list) { - reuse_matrix_[tid_1].SetBitTrue(tid_2); - 
reuse_matrix_[tid_2].SetBitTrue(tid_1); - } - } - } - MS_LOG(INFO) << "End Solving Preprocessing for Ref Overlap"; -} - -void Somas::UpdateRefTensorsConflict() { +void Somas::UpdateUnionTensorsConflict() { // Keep all constraints for first tensor in list - for (auto ref_node_list : ref_node_constraints_) { - size_t tid_0 = ref_node_list[0]; - for (SomasTensorPtr tensor : tensors_list_) { - if (reuse_matrix_[tid_0].IsBitTrue(tensor->GetId()) == false) { - continue; - } - for (size_t tid : ref_node_list) { - if (reuse_matrix_[tid].IsBitTrue(tensor->GetId()) == false) { + for (auto union_node_list : union_tensors_list_) { + size_t tid_0 = union_node_list[0]; + for (const SomasTensorPtr &tensor : tensors_list_) { + for (size_t tid : union_node_list) { + if (!reuse_matrix_[tid].IsBitTrue(tensor->GetId())) { reuse_matrix_[tid_0].SetBitFalse(tensor->GetId()); reuse_matrix_[tensor->GetId()].SetBitFalse(tid_0); break; @@ -1553,15 +1628,20 @@ void Somas::UpdateRefTensorsConflict() { } } // Set rest to size 0, so that solver ignores them (if not contiguous) - for (size_t i = 1; i < ref_node_list.size(); ++i) { - if (!tensors_map_[ref_node_list[i]]->contiguous_) { - tensors_map_[ref_node_list[i]]->aligned_size_ = 0; + for (size_t i = 1; i < union_node_list.size(); ++i) { + if (!tensors_list_[union_node_list[i]]->contiguous_) { + if (tensors_list_[union_node_list[i]]->aligned_size_ > tensors_list_[union_node_list[0]]->aligned_size_) { + MS_LOG(WARNING) << "The aligned_size of union tensor " << tensors_list_[union_node_list[i]]->GetId() + << " is bigger than the aligned_size of union tensor " + << tensors_list_[union_node_list[0]]->GetId(); + } + tensors_list_[union_node_list[i]]->aligned_size_ = 0; } } } } -std::string Somas::GetSplitName(const std::string &scope_name) const { +std::string Somas::GetSplitName(const std::string &scope_name) { auto index = scope_name.rfind('/'); if (index == std::string::npos) { return scope_name; @@ -1582,6 +1662,33 @@ std::string 
Somas::SomasInfo(bool calc_hash) const { DumpTensors(oss); DumpNodes(oss); + oss << "\n\nAll Union Tensors Info:\n\n"; + for (const auto &ref_in_out : union_tensors_list_) { + oss << "union tensors: ["; + for (const auto &item : ref_in_out) { + oss << "%" << item << "T "; + } + oss << "]\n"; + } + + oss << "\n\nAll Original Contiguous Tensors Info:\n\n"; + for (const auto &contiguous : contiguous_tensors_list_) { + oss << "contiguous tensors: ["; + for (const auto &item : contiguous) { + oss << "%" << item << "T "; + } + oss << "]\n"; + } + + oss << "\n\nAll Processed Contiguous Tensors Info:\n\n"; + for (const auto &contiguous : processed_contiguous_tensors_list_) { + oss << "contiguous tensors: ["; + for (const auto &item : contiguous) { + oss << "%" << item << "T "; + } + oss << "]\n"; + } + oss << "\n\nAll Stream Groups:\n\n"; for (const auto &stream_group : streams_groups_) { for (const auto &stream : stream_group) { @@ -1590,25 +1697,13 @@ std::string Somas::SomasInfo(bool calc_hash) const { oss << "\n"; } - if (!ref_node_constraints_.empty()) { - oss << "\n\nAll Ref Node Info:\n\n"; - for (const auto &ref_in_out : ref_node_constraints_) { - oss << "refnode input-output:"; - for (const auto &item : ref_in_out) { - oss << "%" << item << "T "; - } - oss << "\n"; - } + oss << "\n\nAll Merged Blocks:\n\n"; + oss << "start_offset:" + << "\tsize:\n"; + for (const auto &merged_block : dump_merged_blocks_) { + oss << merged_block.first << "\t" << merged_block.second << "\n"; } - - for (const auto &event : event_map_) { - std::pair send_recv_pair = event.second; - std::string send_split_name = GetSplitName(send_recv_pair.first->fullname_with_scope()); - std::string recv_split_name = GetSplitName(send_recv_pair.second->fullname_with_scope()); - oss << "event_id:" << event.first << " send:" << send_split_name << " recv:" << recv_split_name; - oss << "\n"; - } - + oss << "\nTotal Memory Size after reused:" << reused_memory_size_; return oss.str(); } @@ -1633,8 +1728,8 @@ 
void Somas::DumpNodes(std::ostringstream &oss) const { tensor_index++; } } - oss << "]"; + oss << "\toutputs["; for (const auto &out : node->output_tensors_) { MS_EXCEPTION_IF_NULL(out); @@ -1642,6 +1737,7 @@ void Somas::DumpNodes(std::ostringstream &oss) const { << ", "; } oss << "]"; + oss << "\tworkspace["; for (const auto &wk : node->workspace_tensors_) { MS_EXCEPTION_IF_NULL(wk); @@ -1649,6 +1745,23 @@ void Somas::DumpNodes(std::ostringstream &oss) const { << ", "; } oss << "]"; + + oss << "\tctrl_inputs["; + for (const auto &ctrl_in : node->control_input_tensors_) { + MS_EXCEPTION_IF_NULL(ctrl_in); + oss << "%" << ctrl_in->GetId() << "CT" + << ", "; + } + oss << "]"; + + oss << "\tctrl_outputs["; + for (const auto &ctrl_out : node->control_output_tensors_) { + MS_EXCEPTION_IF_NULL(ctrl_out); + oss << "%" << ctrl_out->GetId() << "CT" + << ", "; + } + oss << "]"; + oss << "\tstreamID[" << "@" << node->GetStreamId() << "]\n"; } @@ -1657,10 +1770,9 @@ void Somas::DumpNodes(std::ostringstream &oss) const { void Somas::DumpTensors(std::ostringstream &oss) const { oss << "\n\nAll Tensors:\n\n"; oss << "index:" - << "\tsize:" - << "\treal_size:" + << "\taligned_size:" + << "\toriginal_size:" << "\toffset:" - << "\taddr:" << "\ttype:" << "\tlifelong:" << "\tlife_start:" @@ -1680,9 +1792,23 @@ void Somas::DumpTensors(std::ostringstream &oss) const { << "#" << tensor->GetOriginalSize() << "S" << "\t" << "&" << tensor->GetOffset() << "" + << "\t" << tensor->GetTypeString() << "\t" << tensor->GetLifelongString() << "\t" << tensor->lifetime_.start_ + << "\t" << tensor->lifetime_.end_ << "\t" << split_name << "\n"; + } + for (const auto &tensor : control_tensors_list_) { + MS_EXCEPTION_IF_NULL(tensor); + auto node = GetSomasNode(tensor->GetSourceNodeId()); + MS_EXCEPTION_IF_NULL(node); + auto scope_name = node->scope_full_name_; + std::string split_name = GetSplitName(scope_name); + oss << "%" << tensor->GetId() << "T" << "\t" - << "&" << static_cast(tensor->GetOffset() + 
mem_base_addr_) << "\t" - << tensor_type_name_map[tensor->type_] << "\t" << tensor->IsLifelong() << "\t" << tensor->lifetime_.start_ + << "#" << tensor->GetAlignedSize() << "S" + << "\t" + << "#" << tensor->GetOriginalSize() << "S" + << "\t" + << "&" << tensor->GetOffset() << "" + << "\t" << tensor->GetTypeString() << "\t" << tensor->GetLifelongString() << "\t" << tensor->lifetime_.start_ << "\t" << tensor->lifetime_.end_ << "\t" << split_name << "\n"; } } @@ -1691,7 +1817,6 @@ void Somas::DumpParameters(std::ostringstream &oss) const { oss << "All Parameters:\n\n"; oss << "index:" << "\tsize:" - << "\tstart_addr:" << "\tsource node name:" << "\tnode out index:\n"; @@ -1700,19 +1825,26 @@ void Somas::DumpParameters(std::ostringstream &oss) const { oss << "%" << param->id_ << "P" << "\t" << "#" << param->size_ << "S" - << "\t" - << "&" << param->addr_ << "\t" << param->source_node_name_ << "\t" << param->output_index_ << "\n"; + << "\t" << param->source_node_name_ << "\t" << param->output_index_ << "\n"; } } -void Somas::DumpSomasInfoIR(const string filename) const { (void)Common::SaveStringToFile(filename, SomasInfo()); } +void Somas::DumpSomasModelInfo(const string &tag, uint32_t graph_id) const { +#ifndef ENABLE_SECURITY + if (save_debug_info_) { + std::string file_path = + GetSaveGraphsPathName("/" + device_name_ + "_" + tag + "_" + std::to_string(graph_id) + ".ir", debug_info_path_); + (void)Common::SaveStringToFile(file_path, SomasInfo()); + } +#endif +} std::string Somas::Offline() const { std::ostringstream oss; - for (auto tensor : tensors_list_) { + for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); - if (tensor->IsOutputOnly() || tensor->type_ == TensorType::kRefNodeOutput) { + if (tensor->IsOutputOnly() || tensor->type_ == TensorType::kUnion) { oss << "Somas EDGE ERROR src=n" << tensor->GetSourceNodeId() << ", srcstm=" << tensor->GetSourceStreamId() << ", dst=nc" << ", dststm=nc" @@ -1736,7 +1868,7 @@ std::string Somas::Offline() 
const { } } } - for (vector tList : contiguous_tensors_list_) { + for (const vector &tList : contiguous_tensors_list_) { oss << "Somas CONTIGUOUS"; for (size_t tid : tList) { oss << " " << tid; @@ -1753,84 +1885,11 @@ std::string Somas::Offline() const { return oss.str(); } -void Somas::DumpOfflineIR(const string filename) const { +void Somas::DumpOfflineIR(const string &filename) const { MS_LOG(INFO) << "Printing somas-log-from-graph log: " << filename; (void)Common::SaveStringToFile(filename, Offline()); } -std::string Somas::SomasMemory() const { - std::ostringstream oss; - - std::map mem_map; - for (auto tensor : tensors_list_) { - MS_EXCEPTION_IF_NULL(tensor); - mem_map[tensor->GetOffset()] = 0; - } - - size_t num = 0; - for (auto iter = mem_map.begin(); iter != mem_map.end(); ++iter, ++num) { - iter->second = num; - } - - std::map> mem_list; - - for (const auto &output_tensor : tensors_list_) { - MS_EXCEPTION_IF_NULL(output_tensor); - size_t key = output_tensor->offset_; - auto iter = mem_list.find(key); - if (iter == mem_list.end()) { - std::map id_tensor_map; - id_tensor_map[output_tensor->GetId()] = output_tensor; - mem_list[key] = id_tensor_map; - } else { - iter->second[output_tensor->GetId()] = output_tensor; - } - } - - oss << "mem_id:" - << "\tstart_offset:" - << "\tend_offset:" - << "\ttensor_id:" - << "\torigin_size:" - << "\talign_size:" - << "\tstart_addr:" - << "\tend_addr:" - << "\ttype:" - << "\tsrc_node:" - << "\tsrc_stm_id:" - << "lifetime_start\t" - << "lifetime_end\n"; - - for (const auto &mem : mem_list) { - auto id_tensor_map = mem.second; - for (const auto &id_tensor : id_tensor_map) { - auto place_tensor = id_tensor.second; - MS_EXCEPTION_IF_NULL(place_tensor); - std::string scope_name; - int64_t src_stm_id = 0xffff; - auto node = GetSomasNode(place_tensor->GetSourceNodeId()); - if (node != nullptr) { - scope_name = node->scope_full_name_; - src_stm_id = SizeToLong(node->GetStreamId()); - } else { - scope_name = "Somas Tensor"; - } - - 
std::string split_name = GetSplitName(scope_name); - oss << "#" << mem_map[place_tensor->GetOffset()] << "\t" << place_tensor->GetOffset() << "\t" - << place_tensor->GetOffset() + place_tensor->GetAlignedSize() << "\t%" << place_tensor->GetId() << "T\t" - << place_tensor->GetOriginalSize() << "\t" << place_tensor->GetAlignedSize() << "\t&" - << static_cast(place_tensor->GetOffset() + mem_base_addr_) << "\t&" - << static_cast(place_tensor->GetOffset() + mem_base_addr_ + place_tensor->GetAlignedSize()) << "\t" - << tensor_type_name_map[place_tensor->type_] << "\t" << split_name << "\tstm" << src_stm_id << "\t" - << place_tensor->lifetime_.start_ << "\t" << place_tensor->lifetime_.end_ << "\n"; - } - } - return oss.str(); -} - -void Somas::DumpSomasMemoryIR(const string &filename) const { (void)Common::SaveStringToFile(filename, SomasMemory()); } - size_t Somas::CalcLowerBound() const { size_t max_node_id = std::accumulate(tensors_list_.begin(), tensors_list_.end(), 0, [](size_t max_id, auto tensor) { return std::max(max_id, tensor->lifetime_.end_); @@ -1884,8 +1943,8 @@ void Somas::GenGraphStatisticInfo() { } const double giga = 1024. * 1024. 
* 1024.; - MS_LOG(INFO) << "Lower Bound: " << lower_bound_ << " (" << lower_bound_ / giga - << " GB), Upper Bound: " << upper_bound_ << " (" << upper_bound_ / giga << " GB)"; + MS_LOG(INFO) << "Lower Bound: " << lower_bound_ << " (" << static_cast(lower_bound_) / giga + << " GB), Upper Bound: " << upper_bound_ << " (" << static_cast(upper_bound_) / giga << " GB)"; MS_LOG(INFO) << "\nTotal Dynamic Size (Upper Bound):\t" << upper_bound_ << "\n" << "Theoretical Optimal Size (Lower Bound):\t" << lower_bound_ << "\n" @@ -1895,104 +1954,73 @@ void Somas::GenGraphStatisticInfo() { << "Total LifeLong All Tensor Size:\t" << lifelong_all_total_size_ << "\n" << "Total LifeLong Start Tensor Size:\t" << lifelong_start_total_size_ << "\n" << "Total LifeLong End Tensor Size:\t" << lifelong_end_total_size_ << "\n" - << "Reused Size(Allocate Size):\t" << GetTotalMemSize() << "\n\n\n"; + << "Reused Size(Allocate Size):\t" << reused_memory_size_ << "\n\n\n"; } -uint8_t *Somas::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const { +std::vector> Somas::GetNodeOutputSomasResult(const AnfNodePtr &node) const { MS_EXCEPTION_IF_NULL(node); auto key = node.get(); auto iter = nodes_map_.find(key); - uint8_t *ptr = nullptr; + std::vector> output_somas_result; if (iter != nodes_map_.end()) { auto &somas_node = iter->second.at(0); MS_EXCEPTION_IF_NULL(somas_node); - if (index >= somas_node->output_tensors_.size()) { - MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:[" - << somas_node->output_tensors_.size() << "]"; - } - auto output_tensor = somas_node->output_tensors_[index]; - ptr = mem_base_addr_ + output_tensor->offset_; + std::transform(somas_node->output_tensors_.cbegin(), somas_node->output_tensors_.cend(), + std::back_inserter(output_somas_result), + [](const SomasTensorPtr &tensor) { return std::make_pair(tensor->offset_, tensor->aligned_size_); }); } else { MS_LOG(EXCEPTION) << "node [" << common::AnfAlgo::GetCNodeName(node) << "] don't exist in 
nodes_map"; } - return ptr; + return output_somas_result; } -uint8_t *Somas::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const { +std::vector> Somas::GetNodeWorkSpaceSomasResult(const AnfNodePtr &node) const { MS_EXCEPTION_IF_NULL(node); auto key = node.get(); auto iter = nodes_map_.find(key); - uint8_t *ptr = nullptr; + std::vector> workspace_somas_result; if (iter != nodes_map_.end()) { auto &somas_node = iter->second.at(0); MS_EXCEPTION_IF_NULL(somas_node); - if (index >= somas_node->workspace_tensors_.size()) { - MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's workspace size:[" - << somas_node->workspace_tensors_.size() << "]"; - } - auto workspace_tensor = somas_node->workspace_tensors_[index]; - ptr = mem_base_addr_ + workspace_tensor->offset_; + std::transform(somas_node->workspace_tensors_.cbegin(), somas_node->workspace_tensors_.cend(), + std::back_inserter(workspace_somas_result), + [](const SomasTensorPtr &tensor) { return std::make_pair(tensor->offset_, tensor->aligned_size_); }); + } else { + MS_LOG(EXCEPTION) << "node [" << common::AnfAlgo::GetCNodeName(node) << "] don't exist in nodes_map"; } - return ptr; -} -#ifndef ENABLE_SECURITY -void Somas::ConvertToProfilingNode(uint32_t graph_id) const { -#ifdef ENABLE_D - auto graph_node = MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id); - if (graph_node == nullptr) { - graph_node = MemoryProfiling::GetInstance().AddGraphMemoryNode(graph_id); - MS_LOG(INFO) << "Add graph memory node for dynamic memory profiling, graph id is " << graph_id; - } - - for (const auto &tensor : tensors_list_) { - TensorMemory tensor_memory; - tensor_memory.SetTensorId(tensor->GetId()); - tensor_memory.SetAlignedSize(tensor->GetAlignedSize()); - tensor_memory.SetType(tensor_type_name_map[tensor->type_]); - tensor_memory.SetLifeStart(tensor->lifetime_.start_); - tensor_memory.SetLifeEnd(tensor->lifetime_.end_); - tensor_memory.SetLifeLong(life_long_name_map[tensor->lifelong_value_]); - 
graph_node->AddTensorMemory(tensor_memory); - } - - for (const auto &node : nodes_list_) { - NodeMemory node_memory; - std::string name = GetSplitName(node->scope_full_name_); - node_memory.SetNodeName(name); - node_memory.SetNodeId(node->GetId()); - for (const auto &input_tensor : node->input_tensors_) { - node_memory.AddInputTensorId(input_tensor->GetId()); - } - for (const auto &output_tensor : node->output_tensors_) { - node_memory.AddOutputTensorId(output_tensor->GetId()); - } - for (const auto &workspace_tensor : node->workspace_tensors_) { - node_memory.AddWorkSpaceTensorId(workspace_tensor->GetId()); - } - graph_node->AddNodeMemory(node_memory); - } -#endif + return workspace_somas_result; } SomasStreamPtr Somas::GetSomasStream(size_t stream_id) const { - auto it = std::find_if(streams_list_.begin(), streams_list_.end(), - [stream_id](const SomasStreamPtr &stream) { return stream->GetId() == stream_id; }); - if (it != streams_list_.end()) { - return *(it); + auto it = streams_map_.find(stream_id); + if (it != streams_map_.end()) { + return (*it).second; } else { + MS_LOG(ERROR) << "Can't find somas stream for stream " << stream_id; return nullptr; } } SomasNodePtr Somas::GetSomasNode(size_t node_id) const { - auto it = nodes_id_map_.find(node_id); - if (it == nodes_id_map_.end()) { + if (node_id >= nodes_list_.size()) { return nullptr; } else { - return it->second; + return nodes_list_[node_id]; } } -#endif +common::KernelWithIndex Somas::GetVisitKernelWithReturnType(const AnfNodePtr &ori_node, size_t ori_index) { + auto prenode = common::AnfAlgo::VisitKernelWithReturnType(ori_node, ori_index, false); + while (prenode.first->isa() && nodes_map_.find(prenode.first.get()) == nodes_map_.end()) { + auto anf_node = prenode.first; + auto cnode = anf_node->cast(); + if (!common::AnfAlgo::IsNopNode(cnode)) { + MS_LOG(EXCEPTION) << "Node[" << ori_node->fullname_with_scope() << "] find input node[" + << cnode->fullname_with_scope() << "] doesn't exist in nodes_map 
and is not a nop node!!!!"; + } + prenode = common::AnfAlgo::VisitKernelWithReturnType(cnode->input(kNopNodeRealInputIndex), 0, false); + } + return prenode; +} } // namespace somas } // namespace mindspore diff --git a/mindspore/ccsrc/backend/common/somas/somas.h b/mindspore/ccsrc/backend/common/somas/somas.h index f91f4dfe9a9..abaaeacac21 100644 --- a/mindspore/ccsrc/backend/common/somas/somas.h +++ b/mindspore/ccsrc/backend/common/somas/somas.h @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include "utils/hash_map.h" #include "utils/hash_set.h" @@ -33,9 +34,15 @@ #include "backend/common/session/anf_runtime_algorithm.h" #include "include/common/utils/anfalgo.h" #include "backend/common/session/kernel_graph.h" +#include "runtime/hardware/device_type.h" namespace mindspore { namespace somas { +struct EventPair { + CNodePtr send_; + CNodePtr recv_; +}; + union DestinationUnion { size_t id; size_t index; @@ -43,81 +50,86 @@ union DestinationUnion { }; struct TensorConflictInfo { - size_t tensor_id_; - size_t src_node_id_; + size_t tensor_id; + size_t src_node_id; size_t destination_num; DestinationUnion l; DestinationUnion r; TensorConflictInfo(size_t tensor_id, size_t src_node_id) - : tensor_id_(tensor_id), src_node_id_(src_node_id), destination_num(0) {} + : tensor_id(tensor_id), src_node_id(src_node_id), destination_num(0) {} }; + +struct Block { + size_t start_offset_; + size_t size_; + size_t end_offset_; + + Block(size_t start, size_t size) : start_offset_(start), size_(size) { end_offset_ = start_offset_ + size_; } +}; + +void MergeBlocks(std::vector *block_list, std::stack *merged_blocks); + +enum class UnReuseType { kUnReuseAll, kUnReuseInput, kUnReuseOutput, kUnReuseWorkspace }; class Somas { 
public: // Constructors/Destructors Somas() = default; Somas(const Somas &) = delete; Somas &operator=(const Somas &) = delete; - ~Somas() { mem_base_addr_ = nullptr; } - - bool Allocate(const session::KernelGraph *graph); - const size_t GetTotalMemSize() const { return mem_offset_; } - void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; } - uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const; - uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const; + virtual ~Somas() = default; + bool Assign(const session::KernelGraph &graph); + bool Assign(const KernelGraphPtr &graph_ptr); std::string SomasInfo(bool calc_hash = false) const; - std::string SomasMemory() const; - void DumpSomasInfoIR(const string filename) const; - void DumpSomasMemoryIR(const string &filename) const; - - static bool NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2); #ifndef ENABLE_SECURITY - void ConvertToProfilingNode(uint32_t graph_id) const; + virtual void ConvertToProfilingNode(uint32_t graph_id) const {} #endif private: + // device implementation interface + virtual bool Initialize() = 0; + virtual string GetDeviceName() const = 0; + virtual size_t GetAlignSize(size_t original_size) const = 0; + virtual size_t GetCommunicationReservedSize() const; + + virtual bool GetEnableCacheFlag(const session::KernelGraph &graph) const; + virtual std::vector> GetStreamGroupInfo(const session::KernelGraph &graph) const; + virtual bool GetDependExecOrderFlag(const session::KernelGraph &graph) const = 0; + virtual std::pair GetDebugConfig() const; + + virtual std::map GetUnReuseNodeType(const session::KernelGraph &graph) const; + virtual std::map GetUnReuseNodeName(const session::KernelGraph &graph) const; + + virtual bool InitDevSpecControlTensors(const session::KernelGraph &graph) = 0; + virtual bool DevSpecNodeProcess(const session::KernelGraph &graph) = 0; + // end + + // SOMAS Configuration + std::string 
device_name_{"SOMAS"}; + size_t communication_gap_size_{0}; + + size_t depend_exec_order_{false}; + bool enable_cache_{false}; + bool save_debug_info_{false}; + std::string debug_info_path_; + + std::map un_reuse_node_type_; + std::map un_reuse_node_name_; + // end + std::vector reuse_matrix_; // hash id std::string hash_id_; - // Maps - mindspore::HashMap tensors_map_; - mindspore::HashMap> nodes_map_; - mindspore::HashMap> parameters_map_; - mindspore::HashMap nodes_id_map_; - - // Vectors - std::vector nodes_list_; - std::vector streams_list_; - std::vector tensors_list_; - std::vector parameters_list_; // Stream groups std::vector> streams_groups_; - // event info map - std::map> event_map_; - // Solver TensorsDescMap solver_tensor_desc_map_; SomasSolverPrePtr somas_solver_; - // Contiguous list - std::vector> contiguous_tensors_list_; - - // Ref lists - std::vector> ref_node_constraints_; std::vector> ref_overlap_constraints_; - // total Offset - size_t mem_offset_{0}; - - // Memory base addr - uint8_t *mem_base_addr_{nullptr}; - - // Save debug info - bool save_graphs_{false}; - std::string save_graphs_path_; - // statistic info size_t upper_bound_{0}; size_t lower_bound_{0}; @@ -128,74 +140,147 @@ class Somas { size_t lifelong_start_total_size_{0}; size_t lifelong_end_total_size_{0}; - bool InitSomasTensors(const session::KernelGraph *graph); - void InitBasicInfo(const session::KernelGraph *graph); - void InitSomasStreamAndNode(const session::KernelGraph *graph); - void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph); - void InitSomasInputTensors(const session::KernelGraph *graph); - void InitSomasEventInfos(); - void GetNextOutputProcess(const session::KernelGraph *graph); - void IndependentNodeOutputProcess(const session::KernelGraph *graph); -#ifndef ENABLE_SECURITY - void SummaryInputProcess(const session::KernelGraph *graph); -#endif - void RefNodeProcess(const session::KernelGraph *graph); - void NonTaskSplitProcess(const 
session::KernelGraph *graph); - void UnReuseNodeProcess(const session::KernelGraph *graph); - SomasTensorPtr CreateGapTensor(size_t gap_tensor_id); - void GenContiguousList(const session::KernelGraph *graph); + std::vector> processed_contiguous_tensors_list_; + // key: contiguous list index with first union tensor; value: contiguous list index with other union tensor + std::map contiguous_list_with_ref_index_map_; - void ComputeConflictPairs(); + bool ConfigSomas(const session::KernelGraph &graph); - bool Assign(const session::KernelGraph *graph); - - std::string Offline() const; - void DumpOfflineIR(const string filename) const; - std::string GetSplitName(const string &scope_name) const; - size_t CalcLowerBound() const; - void GenGraphStatisticInfo(); + // somas model + bool InitSomasModel(const session::KernelGraph &graph); + bool InitBasicInfoFromGraph(const session::KernelGraph &graph); + void InitSomasStreamAndNode(const session::KernelGraph &graph); + void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph &graph); + void InitSomasInputTensors(const session::KernelGraph &graph); + void InitCommonNodeInputs(const CNodePtr &kernel); + void InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel); SomasParameterPtr GetSomasParameter(const AnfNodePtr &node, size_t index); SomasParameterPtr CreateSomasParameter(const AnfNodePtr &node, size_t index); - void InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel); - void InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel); - void ComputeOneTensorConflicts(const std::shared_ptr &target_tensor, - const std::vector &tensor_conflict_info_list, - const std::vector &destination_node_list, - const vector &nodes_dependency, - std::vector *tensor_relation) const; + void InitControlTensors(const session::KernelGraph &graph); + bool CommonSpecNodeProcess(const session::KernelGraph &graph); + SomasStreamPtr GetSomasStream(size_t stream_id) const; +#ifndef ENABLE_SECURITY 
+ void SummaryInputProcess(const session::KernelGraph &graph); +#endif + void RefNodeProcess(const session::KernelGraph &graph); + void UnReuseNodeProcess(const session::KernelGraph &graph); + void CommunicationNodeProcess(const session::KernelGraph &graph); + void GetContiguousListContainUnionTensor(); + std::map GetRefTensorsInContiguousList(); + common::KernelWithIndex GetVisitKernelWithReturnType(const AnfNodePtr &ori_node, size_t ori_index); + + // conflict matrix + static bool NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2); + void ComputeConflictMatrix(); + void ComputeBasicMatrix(); + static void ComputeOneTensorConflicts(const std::shared_ptr &target_tensor, + const std::vector &tensor_conflict_info, + const std::vector &destination_node_list, + const vector &nodes_dependency, + std::vector *tensor_relation); void ComputeMultiTensorConflicts(const std::vector &target_tensors_list, - const std::vector &tensor_conflict_info_list, + const std::vector &tensor_conflict_info, const std::vector &destination_node_list, const vector &nodes_dependency, std::vector *tensor_relation) const; void UpdateTensorDestinations(); - void UpdateRefTensorsConflict(); - void UpdateRefOverlapTensorsConflicts(); - void UpdateRefTensorsOffset(); - void UpdateContiguousTensorsOffset(const std::map &contiguous_ref_list_map); - void DumpParameters(std::ostringstream &oss) const; - void DumpTensors(std::ostringstream &oss) const; - void DumpNodes(std::ostringstream &oss) const; - std::map GetContiguousListContainRefTensor(); - std::map GetRefTensorsInContiguousList(); - bool SaveSomasResult(const session::KernelGraph *graph); - bool VerifySomasResult(const session::KernelGraph *graph, const nlohmann::json &somas_json) const; - bool LoadSomasResult(const session::KernelGraph *graph, const string &filename); - bool UpdateTensorsOffset(const std::vector &tensors_json); - bool CalcSomasModelHash(const session::KernelGraph *graph); - void UpdateInputTensor(SomasNodePtr node, 
SomasNodePtr pre_somas_node, SomasTensorPtr input_somas_tensor) const; - bool LoadSomasCache(const session::KernelGraph *graph); - SomasStreamPtr GetSomasStream(size_t stream_id) const; - SomasNodePtr GetSomasNode(size_t node_id) const; + void UpdateUnionTensorsConflict(); static void BuildConflictInfo(const std::shared_ptr &tensor, TensorConflictInfo *tensor_conflict_info, std::vector *destination_node_list); static bool CheckIsDependency(const TensorConflictInfo &tensor_conflict_info, const size_t &src_node_id, const vector &nodes_dependency, const std::vector &destination_node_list); void ProcessSemiLifeLongTensor(); + + // solver + bool Solve(const session::KernelGraph &graph); + void UpdateUnionTensorsOffset(); + void UpdateContiguousTensorsOffset(const std::map &contiguous_ref_list_map); + + // cache + bool SaveSomasResult(const session::KernelGraph &graph); + bool VerifySomasResult(const session::KernelGraph &graph, const nlohmann::json &somas_json) const; + bool LoadSomasResult(const session::KernelGraph &graph, const string &filename); + bool UpdateTensorsOffset(const std::vector &tensors_json); + bool CalcSomasModelHash(const session::KernelGraph &graph); + bool LoadSomasCache(const session::KernelGraph &graph); + + // log + std::string Offline() const; + void DumpOfflineIR(const string &filename) const; + size_t CalcLowerBound() const; + void GenGraphStatisticInfo(); + void DumpParameters(std::ostringstream &oss) const; + void DumpTensors(std::ostringstream &oss) const; + void DumpNodes(std::ostringstream &oss) const; + void DumpSomasModelInfo(const string &tag, uint32_t graph_id) const; + + // update graph + std::vector> GetNodeOutputSomasResult(const AnfNodePtr &node) const; + std::vector> GetNodeWorkSpaceSomasResult(const AnfNodePtr &node) const; + bool UpdateSomasResultToGraph(const session::KernelGraph &graph); + + protected: + std::vector parameters_list_; + std::vector control_tensors_list_; + std::vector tensors_list_; + std::vector nodes_list_; 
+ + mindspore::HashMap streams_map_; + mindspore::HashMap> parameters_map_; + mindspore::HashMap> nodes_map_; + + std::vector> union_tensors_list_; + std::vector> contiguous_tensors_list_; + + void AddControlTensor(const SomasNodePtr &from, const SomasNodePtr &to); + void AddControlTensorFromExecOrder(const session::KernelGraph &graph); + void GraphOutputProcess(const session::KernelGraph &graph); + void UpdateContiguousTensorList(); + SomasNodePtr GetSomasNode(size_t node_id) const; + static std::string GetSplitName(const string &scope_name); + + size_t reused_memory_size_{0}; + std::vector> dump_merged_blocks_; }; using SomasPtr = std::shared_ptr; +using SomasCreator = std::function()>; + +// @todo will delete when old runtime remove +class SomasManager { + public: + static SomasManager &Instance() { + static SomasManager instance{}; + return instance; + } + void Register(device::DeviceType device_type, SomasCreator &&creator) { + if (base_map_.find(device_type) == base_map_.end()) { + (void)base_map_.emplace(device_type, creator); + } + } + SomasPtr GetSomas(device::DeviceType device_type) { + auto iter = base_map_.find(device_type); + if (base_map_.end() != iter) { + MS_EXCEPTION_IF_NULL(iter->second); + return (iter->second)(); + } + return nullptr; + } + + private: + std::map base_map_; +}; + +class SomasRegister { + public: + SomasRegister(device::DeviceType device_type, SomasCreator &&creator) { + SomasManager::Instance().Register(device_type, std::move(creator)); + } + ~SomasRegister() = default; +}; + +#define REG_SOMAS(S, T, C) static const somas::SomasRegister g_##S##_reg(T, []() { return std::make_shared(); }); } // namespace somas } // namespace mindspore #endif // MINDSPORE_CCSRC_BACKEND_COMMON_SOMAS_SOMAS_H_ diff --git a/mindspore/ccsrc/backend/common/somas/somas_node.h b/mindspore/ccsrc/backend/common/somas/somas_node.h index f7dda7b7d4f..777cb29537f 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_node.h +++ 
b/mindspore/ccsrc/backend/common/somas/somas_node.h @@ -39,14 +39,14 @@ class SomasNode { // node's dependency including data dependency and time dependency std::set> ancestor_nodes_; - std::set tensors_; - + // data tensor std::vector input_tensors_; std::vector output_tensors_; std::vector workspace_tensors_; std::map input_parameters_map_; - - mindspore::HashMap anc_stream_max_order_; + // control tensor + std::vector control_input_tensors_; + std::vector control_output_tensors_; // Constructors/Destructors SomasNode(std::string scope_full_name, size_t id, NodeType type, const size_t &stream_id) @@ -57,7 +57,7 @@ class SomasNode { // Accessors const size_t &GetId() const { return id_; } - const size_t GetStreamId() const { return stream_id_; } + const size_t &GetStreamId() const { return stream_id_; } const NodeType &GetType() const { return type_; } private: diff --git a/mindspore/ccsrc/backend/common/somas/somas_solver_pre.cc b/mindspore/ccsrc/backend/common/somas/somas_solver_pre.cc index e548912f1fd..8caeda86255 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_solver_pre.cc +++ b/mindspore/ccsrc/backend/common/somas/somas_solver_pre.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -98,7 +98,7 @@ vector SomasSolverPre::CreateTensorsMaps(const TensorsDescMap &t } return vecTensorsMap; } -Status SomasSolverPre::Solving(const session::KernelGraph *graph, TensorsDescMap *ptensors, +Status SomasSolverPre::Solving(const session::KernelGraph &graph, TensorsDescMap *ptensors, const std::vector *pConstraints, const vector> &continuous_v, bool bVerifySolution, bool ball, SortingType sorting, FittingType fitting, AlgorithmType algorithm) { @@ -198,7 +198,7 @@ Status SomasSolverPre::Solving(const session::KernelGraph *graph, TensorsDescMap return ret; } -void SomasSolverPre::Log(const session::KernelGraph *graph, const TensorsDescMap &tensors, +void SomasSolverPre::Log(const session::KernelGraph &graph, const TensorsDescMap &tensors, const std::vector *pConstraints, const vector> &continuous_v) const { auto context_ptr = MsContext::GetInstance(); @@ -213,13 +213,13 @@ void SomasSolverPre::Log(const session::KernelGraph *graph, const TensorsDescMap } void SomasSolverPre::TensorRelationLog(const std::vector *pConstraints, - const session::KernelGraph *graph) const { + const session::KernelGraph &graph) const { MS_LOG(INFO) << "SomasSolver::Log Writing somas_tensor_relation.ir.."; auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); auto save_graphs_path = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); std::string filename = - GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path); + GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path); std::ostringstream oss; for (size_t tid1 = 0; tid1 < pConstraints->size(); tid1++) { oss << 't' << tid1 << ' '; @@ -232,14 +232,14 @@ void SomasSolverPre::TensorRelationLog(const std::vector *pConstr MS_LOG(INFO) << "SomasSolver somas_tensor_relation Log done"; } -void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors, +void 
SomasSolverPre::SolverInputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors, const vector> &continuous_v) const { MS_LOG(INFO) << "SomasSolver::Log Writing somas_solver_input.."; auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); auto save_graphs_path = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); std::string filename = - GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path); + GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path); std::ostringstream oss; for (auto &t : tensors) { oss << "T " << t.second->index_ << " " << t.second->size_ << " " << t.second->lifelong_ << std::endl; @@ -256,13 +256,13 @@ void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const Ten MS_LOG(INFO) << "SomasSolver input Log done"; } -void SomasSolverPre::SolverOutputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors) const { +void SomasSolverPre::SolverOutputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors) const { MS_LOG(INFO) << "SomasSolver::Log Writing somas_solver_output_.."; auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); auto save_graphs_path = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); std::string out_filename = - GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path); + GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path); std::ostringstream oss; constexpr size_t contiguous_left = 1; constexpr size_t contiguous_mid = 2; diff --git a/mindspore/ccsrc/backend/common/somas/somas_solver_pre.h b/mindspore/ccsrc/backend/common/somas/somas_solver_pre.h index a6613974cf3..094d9148e6f 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_solver_pre.h +++ b/mindspore/ccsrc/backend/common/somas/somas_solver_pre.h @@ -1,5 +1,5 @@ /** - 
* Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -184,14 +184,14 @@ class SomasSolverPre { size_t GetMaxOffset() const { return max_offset_; } - Status Solving(const session::KernelGraph *graph, TensorsDescMap *ptensors, + Status Solving(const session::KernelGraph &graph, TensorsDescMap *ptensors, const std::vector *pConstraints, const vector> &continuous_v, bool bVerifySolution, // true -> Check continuous and non overlapping constraints solution bool ball = true, // true -> run full set of heuristics, false -> run single heuristic specified SortingType sorting = kGreaterSizeSmallerIndex, FittingType fitting = kBest, AlgorithmType algorithm = kManyObjects); - void Log(const session::KernelGraph *graph, const TensorsDescMap &tensors, + void Log(const session::KernelGraph &graph, const TensorsDescMap &tensors, const std::vector *pConstraints, const vector> &continuous_v) const; Status CheckTensors(const TensorsDescMap *pTensors, uint32_t index1, uint32_t index2) const; @@ -201,11 +201,11 @@ class SomasSolverPre { private: size_t max_offset_; - void SolverInputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors, + void SolverInputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors, const vector> &continuous_v) const; - void SolverOutputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors) const; + void SolverOutputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors) const; vector CreateTensorsMaps(const TensorsDescMap &tensors, size_t total_sol) const; - void TensorRelationLog(const std::vector *pConstraints, const session::KernelGraph *graph) const; + void TensorRelationLog(const std::vector *pConstraints, const session::KernelGraph &graph) const; }; using SomasSolverPrePtr = std::shared_ptr; } // namespace somas 
diff --git a/mindspore/ccsrc/backend/common/somas/somas_stream.h b/mindspore/ccsrc/backend/common/somas/somas_stream.h index 2108b8345e0..3766c73a48d 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_stream.h +++ b/mindspore/ccsrc/backend/common/somas/somas_stream.h @@ -31,7 +31,7 @@ class SomasStream { std::vector nodes_; // Constructors/Destructors - explicit SomasStream(int64_t id) : id_(id) {} + explicit SomasStream(size_t id) : id_(id) {} SomasStream(const SomasStream &) = delete; SomasStream &operator=(const SomasStream &) = delete; ~SomasStream() = default; diff --git a/mindspore/ccsrc/backend/common/somas/somas_tensor.cc b/mindspore/ccsrc/backend/common/somas/somas_tensor.cc index 960aa94f912..41bd88479c2 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_tensor.cc +++ b/mindspore/ccsrc/backend/common/somas/somas_tensor.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,25 +15,35 @@ */ #include "backend/common/somas/somas_tensor.h" +#include +#include namespace mindspore { namespace somas { -SomasTensor::SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t real_size, - LifeLongType lifelong_value) - : lifelong_value_(lifelong_value), - between_streams_(false), +std::map tensor_type_name_map = { + {kCommon, "Common"}, {kWorkspace, "Workspace"}, + {kOutputOnly, "OutputOnly"}, {kGraphOutput, "GraphOutput"}, + {kGraphInput, "GraphInput"}, {kSummaryInput, "SummaryInput"}, + {kUnion, "Union"}, {kControl, "Control"}, + {kUnknown, "Unknown"}}; + +std::map life_long_name_map = {{kLifeLongNone, "LifeLongNone"}, + {kLifeLongGraphAll, "LifeLongGraphAll"}, + {kLifeLongGraphStart, "LifeLongGraphStart"}, + {kLifeLongGraphEnd, "LifeLongGraphEnd"}}; + +SomasTensor::SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t ori_size, + size_t aligned_size, LifeLongType lifelong_value) + : aligned_size_(aligned_size), + lifelong_value_(lifelong_value), contiguous_(false), type_(kUnknown), offset_(0), num_constraints_(0), - ref_overlap_(false), id_(id), source_node_id_(source_node_id), source_stream_id_(source_stream_id), - original_size_(real_size) { - const size_t alignment = 512; - const size_t alignment_complement = 31; - aligned_size_ = (real_size > 0) ? 
((real_size + alignment + alignment_complement) / alignment) * alignment : 0; + original_size_(ori_size) { solver_tensor_desc_ = std::make_shared(id_, aligned_size_, offset_, false); } @@ -49,5 +59,9 @@ SomasSolverTensorDescPtr SomasTensor::GetSolverTensorDesc() { return solver_tensor_desc_; } } + +std::string SomasTensor::GetTypeString() { return tensor_type_name_map[type_]; } + +std::string SomasTensor::GetLifelongString() { return life_long_name_map[lifelong_value_]; } } // namespace somas } // namespace mindspore diff --git a/mindspore/ccsrc/backend/common/somas/somas_tensor.h b/mindspore/ccsrc/backend/common/somas/somas_tensor.h index 14a7ebe1003..6967c2a4de0 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_tensor.h +++ b/mindspore/ccsrc/backend/common/somas/somas_tensor.h @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,7 +20,7 @@ #include #include #include - +#include #include "utils/hash_map.h" #include "backend/common/somas/somas_solver_pre.h" @@ -38,21 +38,21 @@ using lifetime_t = struct Lifetime; // Tensor type enum TensorType { kCommon, - kOutputOnly, kWorkspace, - kGetNextOutput, + kOutputOnly, + kGraphOutput, + kGraphInput, kSummaryInput, - kRefNodeInput, - kRefNodeOutput, - kEventVirtualOutput, + kUnion, + kControl, kUnknown }; enum LifeLongType { kLifeLongNone, // life time is from tensor start to tensor end - kLifeLongGraphAll, // life time is from graph start to graph end - kLifeLongGraphStart, // life time is from graph start to tensor end - kLifeLongGraphEnd // life time is from tensor start to graph end + kLifeLongGraphAll, // life time is from graph start to graph end + kLifeLongGraphStart, // life time is from graph start to tensor end + kLifeLongGraphEnd // life time is from tensor start to graph end }; class SomasTensor { @@ -60,7 +60,6 @@ class SomasTensor { size_t aligned_size_{0}; LifeLongType lifelong_value_; - bool between_streams_; bool contiguous_; lifetime_t lifetime_; @@ -72,7 +71,7 @@ class SomasTensor { vector consumer_list_; // Constructors/Destructors - explicit SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t real_size, + explicit SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t ori_size, size_t aligned_size, LifeLongType lifelong_value = kLifeLongNone); SomasTensor(const SomasTensor &) = delete; SomasTensor &operator=(const SomasTensor &) = delete; @@ -86,14 +85,12 @@ class SomasTensor { const size_t &GetAlignedSize() const { return aligned_size_; } const size_t &GetNumConstraints() const { return num_constraints_; } bool IsLifelong() const { return lifelong_value_ == kLifeLongGraphAll; } - bool IsWorkspace() const { return type_ == kWorkspace; } bool IsOutputOnly() const { return type_ == kOutputOnly; } size_t GetOffset() const { return offset_; } - bool IsBetweenStreams() const { 
return between_streams_; } bool IsSemiLifelongStart() const { return lifelong_value_ == kLifeLongGraphStart; } bool IsSemiLifelongEnd() const { return lifelong_value_ == kLifeLongGraphEnd; } - bool IsRefOverlap() const { return ref_overlap_; } - + string GetTypeString(); + string GetLifelongString(); // Computing functions void SetOffset() { if (aligned_size_ != 0) { @@ -104,7 +101,6 @@ class SomasTensor { size_t num_constraints_{0}; private: - bool ref_overlap_; const size_t id_{0}; const size_t source_node_id_; const size_t source_stream_id_; diff --git a/mindspore/ccsrc/backend/graph_compiler/backend.cc b/mindspore/ccsrc/backend/graph_compiler/backend.cc index 1e35cbfebd3..7307c9f878e 100644 --- a/mindspore/ccsrc/backend/graph_compiler/backend.cc +++ b/mindspore/ccsrc/backend/graph_compiler/backend.cc @@ -607,8 +607,8 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) { device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_}); MS_EXCEPTION_IF_NULL(device_context); bool all_support = device_context->PartitionGraph(func_graph); + auto run_mode = device_context->GetRunMode(func_graph); if (all_support) { - auto run_mode = device_context->GetRunMode(func_graph); if (run_mode == device::RunMode::kGraphMode) { auto graph_id = graph_compiler_->CompileWholeGraphForGraphRunMode(func_graph, device_context); graph_id_to_device_context_[graph_id] = device_context; @@ -1384,9 +1384,15 @@ std::unique_ptr MindRTBackend::ConstructGraphCompilerInfo(con std::vector *> tensors_mask; std::vector *> input_tensors; + auto strategy = runtime::GraphExecutionStrategy::kPipeline; + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (context_ptr->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) { + strategy = runtime::GraphExecutionStrategy::kPipelineWithExecutionOrder; + } return std::make_unique(graphs, device_contexts, tensors_mask, input_tensors, control_nodes_, 
root_graph->parameters(), parser, outputs_order, outputs_num, name, false, - runtime::GraphExecutionStrategy::kPipeline); + strategy); } std::unique_ptr MindRTBackend::ConstructGraphCompilerInfo( diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.cc index 2f9c3832d9d..fa803cbf8e5 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.cc +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.cc @@ -104,16 +104,6 @@ uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_m return communication_mem ? alloc_address + kMemAlignSize : alloc_address; } -void AscendMemoryManager::MallocSomasDynamicMem(const session::KernelGraph &graph) { - MemoryManager::MallocSomasDynamicMem(graph); -#ifndef ENABLE_SECURITY - if (MemoryProfiling::GetInstance().IsMemoryProfilingInitialized()) { - MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); - somas_reuse_util_ptr_->ConvertToProfilingNode(graph.graph_id()); - } -#endif -} - // communication memory: [512align_size + data + 512align_size] // return the pointer to the start of data address. 
uint8_t *AscendMemoryManager::MallocCommunicationMemFromMemPool(size_t size) { diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.h b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.h index aba272e3348..59173feaf75 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.h +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.h @@ -36,7 +36,6 @@ class AscendMemoryManager : public MemoryManager { void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override; void FreeMemFromMemPool(void *device_ptr) override; uint64_t GetMsMaxMemSize() const; - void MallocSomasDynamicMem(const session::KernelGraph &graph) override; uint8_t *MallocCommunicationMemFromMemPool(size_t size) override; bool MallocContinuousMemFromMemPool(const DeviceAddressPtrList &addr_list, size_t total_size, std::vector size_list) override; diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.cc b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.cc new file mode 100644 index 00000000000..402c63f8456 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.cc @@ -0,0 +1,229 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/ascend/hal/hardware/ascend_somas.h" +#include +#include +#include +#include +#include "backend/common/optimizer/helper.h" +#include "utils/ms_context.h" +#include "plugin/device/ascend/hal/device/ascend_stream_assign.h" +#include "plugin/device/ascend/hal/profiler/memory_profiling.h" + +namespace mindspore { +namespace device { +namespace ascend { +using KernelGraph = session::KernelGraph; +using UnReuseType = somas::UnReuseType; +using TensorType = somas::TensorType; +using LifeLongType = somas::LifeLongType; +using mindspore::profiler::ascend::MemoryProfiling; + +#ifndef ENABLE_SECURITY +void AscendSomas::ConvertToProfilingNode(uint32_t graph_id) const { + if (!MemoryProfiling::GetInstance().IsMemoryProfilingInitialized()) { + return; + } + auto graph_node = profiler::ascend::MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id); + if (graph_node == nullptr) { + graph_node = profiler::ascend::MemoryProfiling::GetInstance().AddGraphMemoryNode(graph_id); + MS_LOG(INFO) << "Add graph memory node for dynamic memory profiling, graph id is " << graph_id; + } + + for (const auto &tensor : tensors_list_) { + profiler::ascend::TensorMemory tensor_memory; + tensor_memory.SetTensorId(tensor->GetId()); + tensor_memory.SetAlignedSize(tensor->GetAlignedSize()); + tensor_memory.SetType(tensor->GetTypeString()); + tensor_memory.SetLifeStart(tensor->lifetime_.start_); + tensor_memory.SetLifeEnd(tensor->lifetime_.end_); + tensor_memory.SetLifeLong(tensor->GetLifelongString()); + graph_node->AddTensorMemory(tensor_memory); + } + + for (const auto &node : nodes_list_) { + profiler::ascend::NodeMemory node_memory; + std::string name = GetSplitName(node->scope_full_name_); + node_memory.SetNodeName(name); + node_memory.SetNodeId(node->GetId()); + for (const auto &input_tensor : node->input_tensors_) { + node_memory.AddInputTensorId(input_tensor->GetId()); + } + for (const auto &output_tensor : node->output_tensors_) { + 
node_memory.AddOutputTensorId(output_tensor->GetId()); + } + for (const auto &workspace_tensor : node->workspace_tensors_) { + node_memory.AddWorkSpaceTensorId(workspace_tensor->GetId()); + } + graph_node->AddNodeMemory(node_memory); + } +} +#endif + +bool AscendSomas::Initialize() { return true; } + +std::string AscendSomas::GetDeviceName() const { return "Ascend"; } + +size_t AscendSomas::GetCommunicationReservedSize() const { + constexpr size_t gap_size = 512; + return gap_size; +} + +size_t AscendSomas::GetAlignSize(size_t original_size) const { + constexpr size_t alignment = 512; + constexpr size_t alignment_complement = 31; + size_t aligned_size = + (original_size > 0) ? ((original_size + alignment + alignment_complement) / alignment) * alignment : 0; + return aligned_size; +} + +bool AscendSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const { + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + auto task_sink = ms_context->get_param(MS_CTX_ENABLE_TASK_SINK); + auto opt_level = ms_context->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL); + if (task_sink || (opt_level == kOptimizeO1)) { + return true; + } else { + return false; + } +} + +std::vector> AscendSomas::GetStreamGroupInfo(const session::KernelGraph &graph) const { + std::vector> stream_group; + stream_group = device::ascend::AscendStreamAssign::GetInstance().get_stream_group(); + return stream_group; +} + +std::map AscendSomas::GetUnReuseNodeType(const session::KernelGraph &graph) const { + std::map node_type; + node_type[kGetNextOpName] = UnReuseType::kUnReuseOutput; + return node_type; +} + +bool AscendSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) { + InitEventInfo(graph); + return true; +} + +void AscendSomas::InitEventInfo(const session::KernelGraph &graph) { + event_map_ = {}; + auto &kernels = graph.execution_order(); + for (const auto &kernel : kernels) { + auto type = common::AnfAlgo::GetCNodeName(kernel); + if (type == 
kSendOpName) { + auto event = common::AnfAlgo::GetNodeAttr(kernel, kAttrEventId); + auto iter = event_map_.find(event); + if (iter == event_map_.end()) { + auto pair = somas::EventPair(); + pair.send_ = kernel; + event_map_[event] = pair; + } else { + iter->second.send_ = kernel; + } + } else if (type == kRecvOpName) { + auto event = common::AnfAlgo::GetNodeAttr(kernel, kAttrEventId); + auto iter = event_map_.find(event); + if (iter == event_map_.end()) { + auto pair = somas::EventPair(); + pair.recv_ = kernel; + event_map_[event] = pair; + } else { + iter->second.recv_ = kernel; + } + } + } + + for (auto &event : event_map_) { + auto pair = event.second; + auto send_iter = nodes_map_.find(pair.send_.get()); + if (send_iter == nodes_map_.end()) { + MS_LOG(WARNING) << "Can't find somas node for " << pair.send_->fullname_with_scope(); + continue; + } + + auto recv_iter = nodes_map_.find(pair.recv_.get()); + if (recv_iter == nodes_map_.end()) { + MS_LOG(WARNING) << "Can't find somas node for " << pair.recv_->fullname_with_scope(); + continue; + } + + auto &somas_send = send_iter->second.at(0); + auto &somas_recv = recv_iter->second.at(0); + AddControlTensor(somas_send, somas_recv); + } + MS_LOG(DEBUG) << "Somas InitEventInfo end."; +} + +bool AscendSomas::DevSpecNodeProcess(const session::KernelGraph &graph) { + IndependentNodeOutputProcess(graph); + NonTaskSplitProcess(graph); + return true; +} + +void AscendSomas::IndependentNodeOutputProcess(const session::KernelGraph &graph) { + auto &kernel_cnodes = graph.execution_order(); + size_t total_size = 0; + for (const auto &kernel : kernel_cnodes) { + bool independent = AnfAlgo::IsIndependentNode(kernel); + if (!independent) { + continue; + } + auto iter = nodes_map_.find(kernel.get()); + if (iter != nodes_map_.end()) { + auto &node = iter->second.at(0); + MS_EXCEPTION_IF_NULL(node); + auto semi_reuse_output_tensors = node->output_tensors_; + for (auto &tensor : semi_reuse_output_tensors) { + 
MS_EXCEPTION_IF_NULL(tensor); + total_size += tensor->GetAlignedSize(); + tensor->lifelong_value_ = LifeLongType::kLifeLongGraphEnd; + } + } + } + + MS_LOG(INFO) << "Special Tensor total size: Independent Node output " << total_size; +} + +void AscendSomas::NonTaskSplitProcess(const session::KernelGraph &graph) { + auto &kernel_cnodes = graph.execution_order(); + for (const auto &kernel : kernel_cnodes) { + auto op_name = common::AnfAlgo::GetCNodeName(kernel); + if (common::AnfAlgo::IsNonTaskOp(kernel)) { + std::vector refnode_input_output; + auto node = nodes_map_[kernel.get()].at(0); + MS_EXCEPTION_IF_NULL(node); + if (node->input_tensors_.empty()) { + MS_LOG(EXCEPTION) << op_name << " has no input tensor, can not do split non_task process."; + } + auto input_tensor = node->input_tensors_[0]; + MS_EXCEPTION_IF_NULL(input_tensor); + input_tensor->type_ = TensorType::kUnion; + refnode_input_output.push_back(input_tensor->GetId()); + + for (auto &output_tensor : node->output_tensors_) { + MS_EXCEPTION_IF_NULL(output_tensor); + output_tensor->type_ = TensorType::kUnion; + refnode_input_output.push_back(output_tensor->GetId()); + } + union_tensors_list_.push_back(refnode_input_output); + } + } +} +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.h b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.h new file mode 100644 index 00000000000..d741f2d613e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.h @@ -0,0 +1,61 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_ + +#include +#include +#include +#include +#include +#include "backend/common/somas/somas.h" +#include "runtime/hardware/device_type.h" + +namespace mindspore { +namespace device { +namespace ascend { +using KernelGraph = session::KernelGraph; +using UnReuseType = somas::UnReuseType; +class AscendSomas : public somas::Somas { + public: +#ifndef ENABLE_SECURITY + void ConvertToProfilingNode(uint32_t graph_id) const override; +#endif + private: + bool Initialize() override; + string GetDeviceName() const override; + size_t GetCommunicationReservedSize() const override; + size_t GetAlignSize(size_t original_size) const override; + + bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override; + std::vector> GetStreamGroupInfo(const session::KernelGraph &graph) const override; + std::map GetUnReuseNodeType(const session::KernelGraph &graph) const override; + + bool InitDevSpecControlTensors(const session::KernelGraph &graph) override; + bool DevSpecNodeProcess(const session::KernelGraph &graph) override; + + void InitEventInfo(const session::KernelGraph &graph); + void IndependentNodeOutputProcess(const session::KernelGraph &graph); + void NonTaskSplitProcess(const session::KernelGraph &graph); + std::map event_map_; +}; +REG_SOMAS(Ascend, DeviceType::kAscend, AscendSomas) +} // namespace ascend +} // namespace device +} // namespace mindspore + +#endif // 
MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.cc b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.cc new file mode 100644 index 00000000000..9c108a2dec1 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.cc @@ -0,0 +1,41 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "plugin/device/cpu/hal/hardware/cpu_somas.h" +#include +#include "utils/ms_context.h" + +namespace mindspore { +namespace device { +namespace cpu { +bool CPUSomas::Initialize() { return true; } + +std::string CPUSomas::GetDeviceName() const { return "CPU"; } + +size_t CPUSomas::GetAlignSize(size_t original_size) const { + constexpr size_t alignment = 512; + size_t aligned_size = (original_size > 0) ? 
((original_size + alignment - 1) / alignment) * alignment : 0; + return aligned_size; +} + +bool CPUSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const { return false; } + +bool CPUSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) { return true; } + +bool CPUSomas::DevSpecNodeProcess(const session::KernelGraph &graph) { return true; } +} // namespace cpu +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.h b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.h new file mode 100644 index 00000000000..3df3b0369d7 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_plugin_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H__ +#define MINDSPORE_CCSRC_plugin_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H__ + +#include +#include "backend/common/somas/somas.h" +#include "runtime/hardware/device_type.h" + +namespace mindspore { +namespace device { +namespace cpu { +using KernelGraph = session::KernelGraph; +class CPUSomas : public somas::Somas { + private: + bool Initialize() override; + string GetDeviceName() const override; + size_t GetAlignSize(size_t original_size) const override; + + bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override; + bool InitDevSpecControlTensors(const session::KernelGraph &graph) override; + bool DevSpecNodeProcess(const session::KernelGraph &graph) override; +}; +REG_SOMAS(CPU, DeviceType::kCPU, CPUSomas) +} // namespace cpu +} // namespace device +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_plugin_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H__ diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc index 47af2b2173a..d2f91b0d4e8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc +++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc @@ -25,6 +25,7 @@ #include "plugin/device/gpu/hal/device/gpu_stream_assign.h" #include "plugin/device/gpu/hal/device/distribution/collective_init.h" #include "plugin/device/gpu/hal/device/gpu_device_manager.h" +#include "plugin/device/gpu/hal/hardware/gpu_somas.h" #include "runtime/data_queue/data_queue_mgr.h" #include "kernel/common_utils.h" #include "plugin/device/gpu/hal/device/gpu_common.h" @@ -40,6 +41,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "backend/common/optimizer/common_backend_optimization.h" #include "backend/common/optimizer/dynamic_shape/dynamic_shape_helper.h" +#include "include/common/debug/anf_ir_dump.h" #ifdef ENABLE_DUMP_IR #include 
"include/common/debug/rdr/recorder_manager.h" #include "debug/rdr/mem_address_recorder.h" @@ -258,6 +260,25 @@ DeviceAddressPtr GPUDeviceResManager::CreateDeviceAddress(void *const device_ptr return device_address; } +void GPUKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + auto kernel_graph = graph->cast(); + MS_EXCEPTION_IF_NULL(kernel_graph); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (ms_context->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) { + auto somas = std::make_shared(); + bool ret = somas->Assign(kernel_graph); + if (ret) { + MS_LOG(INFO) << "Somas allocate success for graph " << kernel_graph->graph_id() + << " somas size: " << kernel_graph->somas_whole_block_size(); + } else { + MS_LOG(WARNING) << "Somas allocate failed for graph " << kernel_graph->graph_id(); + } + } + MS_LOG(INFO) << "Status record: end preprocess before run graph. graph id: " << kernel_graph->graph_id(); +} + void GPUKernelExecutor::OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph) const { MS_EXCEPTION_IF_NULL(graph); // Operator fusion optimization. 
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.h b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.h index a0b670a4a62..0c1edc10db6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.h +++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.h @@ -82,6 +82,8 @@ class GPUKernelExecutor : public DeprecatedKernelExecutor { void CreateKernel(const std::vector &nodes) const override; + void PreprocessBeforeRun(const FuncGraphPtr &graph) const override; + bool LaunchKernel(const CNodePtr &kernel, const std::vector &inputs, const std::vector &workspace, const std::vector &outputs) const override; diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.cc b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.cc new file mode 100644 index 00000000000..2a477dc78bd --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.cc @@ -0,0 +1,141 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/hal/hardware/gpu_somas.h" +#include +#include +#include "backend/common/optimizer/helper.h" +#include "utils/ms_context.h" + +namespace mindspore { +namespace device { +namespace gpu { +bool GPUSomas::Initialize() { return true; } + +std::string GPUSomas::GetDeviceName() const { return "GPU"; } + +size_t GPUSomas::GetAlignSize(size_t original_size) const { + constexpr size_t alignment = 512; + size_t aligned_size = (original_size > 0) ? ((original_size + alignment - 1) / alignment) * alignment : 0; + return aligned_size; +} + +bool GPUSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (context_ptr->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) { + return true; + } else { + return false; + } +} + +bool GPUSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) { + InitEventInfo(graph); + return true; +} + +void GPUSomas::InitEventInfo(const session::KernelGraph &graph) { + event_map_ = {}; + auto &kernels = graph.execution_order(); + for (const auto &kernel : kernels) { + auto type = common::AnfAlgo::GetCNodeName(kernel); + if (type == kSendOpName) { + auto event = common::AnfAlgo::GetNodeAttr(kernel, kAttrRecordEvent); + auto iter = event_map_.find(event); + if (iter == event_map_.end()) { + auto pair = somas::EventPair(); + pair.send_ = kernel; + event_map_[event] = pair; + } else { + iter->second.send_ = kernel; + } + } else if (type == kRecvOpName) { + auto event = common::AnfAlgo::GetNodeAttr(kernel, kAttrWaitEvent); + auto iter = event_map_.find(event); + if (iter == event_map_.end()) { + auto pair = somas::EventPair(); + pair.recv_ = kernel; + event_map_[event] = pair; + } else { + iter->second.recv_ = kernel; + } + } + } + + for (auto &event : event_map_) { + auto pair = event.second; + auto send_iter = nodes_map_.find(pair.send_.get()); + if (send_iter == nodes_map_.end()) { + 
MS_LOG(WARNING) << "Can't find somas node for " << pair.send_->fullname_with_scope(); + continue; + } + + auto recv_iter = nodes_map_.find(pair.recv_.get()); + if (recv_iter == nodes_map_.end()) { + MS_LOG(WARNING) << "Can't find somas node for " << pair.recv_->fullname_with_scope(); + continue; + } + + auto &somas_send = send_iter->second.at(0); + auto &somas_recv = recv_iter->second.at(0); + AddControlTensor(somas_send, somas_recv); + } + MS_LOG(DEBUG) << "Somas InitEventInfo end."; +} + +bool GPUSomas::DevSpecNodeProcess(const session::KernelGraph &graph) { return InplaceNodeProcess(graph); } + +bool GPUSomas::InplaceNodeProcess(const session::KernelGraph &graph) { + auto &kernels = graph.execution_order(); + for (auto &kernel : kernels) { + if (!common::AnfAlgo::IsInplaceNode(kernel, "skip")) { + continue; + } + auto iter = nodes_map_.find(kernel.get()); + if (iter != nodes_map_.end()) { + auto &node = iter->second.at(0); + MS_EXCEPTION_IF_NULL(node); + auto input_tensors = node->input_tensors_; + auto output_tensors = node->output_tensors_; + std::vector union_tensors; + union_tensors.insert(union_tensors.end(), input_tensors.begin(), input_tensors.end()); + union_tensors.insert(union_tensors.end(), output_tensors.begin(), output_tensors.end()); + // check whether the union tensor already in other union tensors + for (auto &tensor : union_tensors) { + auto tensor_id = tensor->GetId(); + for (auto &union_list : union_tensors_list_) { + if (std::count(union_list.begin(), union_list.end(), tensor_id)) { + MS_LOG(EXCEPTION) << "Inplace node union Tensor " << tensor_id << " already in other union tensor list."; + } + } + } + std::vector inplace_union_tensor_list; + for (auto &tensor : union_tensors) { + tensor->type_ = somas::kUnion; + inplace_union_tensor_list.push_back(tensor->GetId()); + } + + union_tensors_list_.push_back(inplace_union_tensor_list); + } else { + MS_LOG(EXCEPTION) << "Can't find somas node for inplace node " << kernel->fullname_with_scope(); + } 
+ } + return true; +} +} // namespace gpu +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.h b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.h new file mode 100644 index 00000000000..8f64a1c5621 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.h @@ -0,0 +1,48 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_plugin_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H__ +#define MINDSPORE_CCSRC_plugin_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H__ + +#include +#include +#include "backend/common/somas/somas.h" +#include "runtime/hardware/device_type.h" + +namespace mindspore { +namespace device { +namespace gpu { +using KernelGraph = session::KernelGraph; + +class GPUSomas : public somas::Somas { + private: + bool Initialize() override; + string GetDeviceName() const override; + size_t GetAlignSize(size_t original_size) const override; + + bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override; + bool InitDevSpecControlTensors(const session::KernelGraph &graph) override; + bool DevSpecNodeProcess(const session::KernelGraph &graph) override; + bool InplaceNodeProcess(const session::KernelGraph &graph); + void InitEventInfo(const session::KernelGraph &graph); + std::map event_map_; +}; +REG_SOMAS(GPU, DeviceType::kGPU, GPUSomas) +} // namespace gpu +} // namespace 
device +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_plugin_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H__ diff --git a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc index 53d6b252849..908bf9ab3f0 100644 --- a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc +++ b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc @@ -101,7 +101,8 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) { .value("graph_kernel_flags", MsCtxParam::MS_CTX_GRAPH_KERNEL_FLAGS) .value("grad_for_scalar", MsCtxParam::MS_CTX_GRAD_FOR_SCALAR) .value("pynative_synchronize", MsCtxParam::MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE) - .value("disable_format_transform", MsCtxParam::MS_CTX_DISABLE_FORMAT_TRANSFORM); + .value("disable_format_transform", MsCtxParam::MS_CTX_DISABLE_FORMAT_TRANSFORM) + .value("memory_optimize_level", MsCtxParam::MS_CTX_MEMORY_OPTIMIZE_LEVEL); (void)py::class_>(*m, "MSContext") .def_static("get_instance", &mindspore::MsContext::GetInstance, "Get ms context instance.") .def("get_param", &mindspore::MsCtxGetParameter, "Get value of specified parameter.") diff --git a/mindspore/ccsrc/runtime/device/CMakeLists.txt b/mindspore/ccsrc/runtime/device/CMakeLists.txt index eacb106bdd7..d73d8f167b9 100644 --- a/mindspore/ccsrc/runtime/device/CMakeLists.txt +++ b/mindspore/ccsrc/runtime/device/CMakeLists.txt @@ -3,6 +3,7 @@ file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/* "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" "memory_scheduler.cc" "memory_offload_strategy.cc" "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "tensor_array.cc" "ms_device_shape_transfer.cc" "context_extends.cc" "stream_synchronizer.cc" "tensors_queue.cc" "auto_mem_offload.cc" + "common_somas_allocator.cc" ) if("${ENABLE_HIDDEN}" STREQUAL "OFF") diff --git a/mindspore/ccsrc/runtime/device/common_somas_allocator.cc b/mindspore/ccsrc/runtime/device/common_somas_allocator.cc new file mode 100644 index 
00000000000..0f8b62da569 --- /dev/null +++ b/mindspore/ccsrc/runtime/device/common_somas_allocator.cc @@ -0,0 +1,86 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "runtime/device/common_somas_allocator.h" +#include +#include +#include "backend/common/optimizer/helper.h" +#include "utils/ms_context.h" +#ifdef ENABLE_DUMP_IR +#include "debug/rdr/string_recorder.h" +#endif + +namespace mindspore { +namespace device { +bool CommonSomasAllocator::Assign(const session::KernelGraph &graph) { + somas::SomasPtr somas_ptr{nullptr}; + if (GetTargetFromContext() == kAscendDevice) { + somas_ptr = somas::SomasManager::Instance().GetSomas(DeviceType::kAscend); + } else if (GetTargetFromContext() == kGPUDevice) { + somas_ptr = somas::SomasManager::Instance().GetSomas(DeviceType::kGPU); + } else { + somas_ptr = somas::SomasManager::Instance().GetSomas(DeviceType::kCPU); + } + MS_EXCEPTION_IF_NULL(somas_ptr); + bool ret = somas_ptr->Assign(graph); + if (ret) { +#ifdef ENABLE_DUMP_IR + SubModuleId module = SubModuleId::SM_OPTIMIZER; + std::string name = "somas_allocate_info." 
+ std::to_string(graph.graph_id()); + (void)mindspore::RDR::RecordString(module, name, somas_ptr->SomasInfo()); +#endif +#ifndef ENABLE_SECURITY + somas_ptr->ConvertToProfilingNode(graph.graph_id()); +#endif + } + return ret; +} + +uint8_t *CommonSomasAllocator::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const { + MS_EXCEPTION_IF_NULL(node); + auto kernel_info = dynamic_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + if (index >= kernel_info->somas_output_offset_aligned_size_list().size()) { + MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:[" + << kernel_info->somas_output_offset_aligned_size_list().size() << "]"; + } + auto somas_offset_aligned_size = kernel_info->somas_output_offset_aligned_size_list()[index]; + if (somas_offset_aligned_size.second == 0) { + return nullptr; + } + auto somas_offset = somas_offset_aligned_size.first; + uint8_t *ptr = mem_base_addr_ + somas_offset; + return ptr; +} + +uint8_t *CommonSomasAllocator::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const { + MS_EXCEPTION_IF_NULL(node); + auto kernel_info = dynamic_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + if (index >= kernel_info->somas_workspace_offset_aligned_size_list().size()) { + MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:[" + << kernel_info->somas_workspace_offset_aligned_size_list().size() << "]"; + } + auto somas_offset_aligned_size = kernel_info->somas_workspace_offset_aligned_size_list()[index]; + if (somas_offset_aligned_size.second == 0) { + return nullptr; + } + auto somas_offset = somas_offset_aligned_size.first; + uint8_t *ptr = mem_base_addr_ + somas_offset; + return ptr; +} +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/common_somas_allocator.h b/mindspore/ccsrc/runtime/device/common_somas_allocator.h new file mode 100644 index 00000000000..e3c796c741a --- /dev/null +++ 
b/mindspore/ccsrc/runtime/device/common_somas_allocator.h @@ -0,0 +1,50 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H +#define MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H + +#include +#include +#include +#include +#include +#include "backend/common/somas/somas.h" +#include "runtime/hardware/device_type.h" +#include "utils/ms_context.h" + +namespace mindspore { +namespace device { +class CommonSomasAllocator { + public: + void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; } + static bool Assign(const session::KernelGraph &graph); + uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const; + uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const; + + private: + // Memory base addr + uint8_t *mem_base_addr_{nullptr}; + static std::string GetTargetFromContext() { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + return context_ptr->get_param(MS_CTX_DEVICE_TARGET); + } +}; +using CommonSomasAllocatorPtr = std::shared_ptr; +} // namespace device +} // namespace mindspore +#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H diff --git a/mindspore/ccsrc/runtime/device/kernel_info.cc b/mindspore/ccsrc/runtime/device/kernel_info.cc index 111dac5c5c6..0e9058984c7 100644 --- 
a/mindspore/ccsrc/runtime/device/kernel_info.cc +++ b/mindspore/ccsrc/runtime/device/kernel_info.cc @@ -15,6 +15,7 @@ */ #include "runtime/device/kernel_info.h" +#include namespace mindspore { namespace device { @@ -108,6 +109,13 @@ bool KernelInfo::SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t return true; } +bool KernelInfo::SetSomasResult(std::vector> &&output_somas_result, + std::vector> &&workspace_somas_result) { + somas_output_result_ = std::move(output_somas_result); + somas_workspace_result_ = std::move(workspace_somas_result); + return true; +} + void KernelInfo::set_kernel_mod(const kernel::KernelModPtr &kernel_mod) { kernel_mod_ = kernel_mod; } kernel::KernelMod *KernelInfo::MutableKernelMod() const { return kernel_mod_.get(); } diff --git a/mindspore/ccsrc/runtime/device/kernel_info.h b/mindspore/ccsrc/runtime/device/kernel_info.h index 7e2ef6802e7..9c8dbf5dc12 100644 --- a/mindspore/ccsrc/runtime/device/kernel_info.h +++ b/mindspore/ccsrc/runtime/device/kernel_info.h @@ -19,6 +19,7 @@ #include #include +#include #include "ir/kernel_info_dev.h" #include "kernel/kernel_build_info.h" #include "kernel/kernel.h" @@ -57,6 +58,8 @@ class KernelInfo : public KernelInfoDevice { DeviceAddressPtr GetMutableWorkspaceAddr(size_t index) const; bool WorkspaceAddrExist(size_t index) const; bool SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t index); + bool SetSomasResult(std::vector> &&output_somas_result, + std::vector> &&workspace_somas_result); void set_kernel_mod(const kernel::KernelModPtr &kernel_mod); kernel::KernelMod *MutableKernelMod() const; const kernel::KernelMod *kernel_mod() const; @@ -70,6 +73,12 @@ class KernelInfo : public KernelInfoDevice { uint32_t graph_id() const { return graph_id_; } bool operator==(const KernelInfo &other) const; bool is_feature_map() const { return is_feature_map_; } + const std::vector> &somas_output_offset_aligned_size_list() const { + return somas_output_result_; + } + const std::vector> 
&somas_workspace_offset_aligned_size_list() const { + return somas_workspace_result_; + } const std::vector> &output_address_list() const { return output_address_list_; } const std::vector> &workspace_address_list() const { return workspace_address_list_; } @@ -83,6 +92,12 @@ class KernelInfo : public KernelInfoDevice { kernel::KernelBuildInfoPtr select_kernel_build_info_; std::vector> output_address_list_; std::vector> workspace_address_list_; + // pair : (offset, aligned_size) + // aligned_size of 0 means no memory allocation + std::vector> somas_output_result_; + // pair : (offset, aligned_size) + // aligned_size of 0 means no memory allocation + std::vector> somas_workspace_result_; kernel::KernelModPtr kernel_mod_; // stream_id_ is the index of stream object vector uint32_t stream_id_; diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc index ff3d51d548c..93105178a81 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc @@ -985,7 +985,12 @@ void KernelRuntime::AssignNodeOutputMem(MemType type, const AnfNodePtr &node, in auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type, {node, i}); MS_EXCEPTION_IF_NULL(device_address); uint8_t *ptr = mem_manager_->MallocOutputMem(node, i, type, output_sizes[i], device_address, false); - MS_EXCEPTION_IF_NULL(ptr); + if (ptr == nullptr && type == kSomasReuseDynamicMem) { + MS_LOG(INFO) << "node: " << node->fullname_with_scope() << " could be a RefNode, please check it" + << " output index: " << i << " memory type: " << type; + } else { + MS_EXCEPTION_IF_NULL(ptr); + } device_address->set_host_shape(trans::GetRuntimePaddingShape(node, i)); AnfAlgo::SetOutputAddr(device_address, i, node.get()); } diff --git a/mindspore/ccsrc/runtime/device/memory_manager.cc b/mindspore/ccsrc/runtime/device/memory_manager.cc index 77aceb99341..85b47b96f6e 100644 --- 
a/mindspore/ccsrc/runtime/device/memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/memory_manager.cc @@ -18,10 +18,6 @@ #include #include "backend/common/session/anf_runtime_algorithm.h" #include "include/common/utils/anfalgo.h" -#include "include/common/debug/common.h" -#ifdef ENABLE_DUMP_IR -#include "debug/rdr/string_recorder.h" -#endif #include "utils/ms_context.h" namespace mindspore { @@ -37,41 +33,21 @@ size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) { } void MemoryManager::MallocSomasDynamicMem(const session::KernelGraph &graph) { - SomasPtr somas_reuse_util_ptr = std::make_shared(); - MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr); - somas_reuse_util_ptr_ = somas_reuse_util_ptr; + SomasAllocatorPtr somas_allocator_ptr = std::make_shared(); + MS_EXCEPTION_IF_NULL(somas_allocator_ptr); + somas_allocator_ptr_ = somas_allocator_ptr; - if (!(somas_reuse_util_ptr->Allocate(&graph))) { + if (!(somas_allocator_ptr->Assign(graph))) { MS_LOG(EXCEPTION) << "Somas Allocate Failed."; } - size_t total_allocated_size = somas_reuse_util_ptr->GetTotalMemSize(); + size_t total_allocated_size = graph.somas_whole_block_size(); MS_LOG(INFO) << "Graph " << graph.graph_id() << ": TotalSomasReuseDynamicSize [" << total_allocated_size << "]"; if (total_allocated_size > 0) { auto base_ptr = MallocDynamicMem(total_allocated_size, false); MS_LOG(INFO) << "Somas Reuse Memory Base Address [" << static_cast(base_ptr) << "], End Address [" << static_cast(base_ptr + total_allocated_size) << "]"; - somas_reuse_util_ptr->set_mem_base_addr(base_ptr); - } - - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); -#ifdef ENABLE_DUMP_IR - SubModuleId module = SubModuleId::SM_OPTIMIZER; - - std::string name = "somas_allocate_info." + std::to_string(graph.graph_id()); - (void)mindspore::RDR::RecordString(module, name, somas_reuse_util_ptr_->SomasInfo()); - - name = "somas_mem_info." 
+ std::to_string(graph.graph_id()); - (void)mindspore::RDR::RecordString(module, name, somas_reuse_util_ptr_->SomasMemory()); -#endif - bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); - if (save_graphs) { - std::string file_path = GetSaveGraphsPathName("somas_allocate_info_" + std::to_string(graph.graph_id()) + ".ir"); - somas_reuse_util_ptr_->DumpSomasInfoIR(file_path); - - std::string mem_file_path = GetSaveGraphsPathName("somas_mem_info_" + std::to_string(graph.graph_id()) + ".ir"); - somas_reuse_util_ptr_->DumpSomasMemoryIR(mem_file_path); + somas_allocator_ptr->set_mem_base_addr(base_ptr); } } @@ -94,8 +70,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me address->communication_ptr_ = ptr - kMemAlignSize; } } else if (type == kSomasReuseDynamicMem) { - MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); - ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index); + MS_EXCEPTION_IF_NULL(somas_allocator_ptr_); + ptr = somas_allocator_ptr_->GetNodeOutputPtr(node, index); } else { ptr = MallocDynamicMem(size, communication_mem); } @@ -109,8 +85,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me } else if (type == kDynamicMem) { ptr = MallocDynamicMem(size, false); } else if (type == kSomasReuseDynamicMem) { - MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); - ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index); + MS_EXCEPTION_IF_NULL(somas_allocator_ptr_); + ptr = somas_allocator_ptr_->GetNodeOutputPtr(node, index); } address->ptr_ = ptr; return ptr; @@ -118,8 +94,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, MemType type, size_t size) { if (type == kSomasReuseDynamicMem) { - MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); - return somas_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index); + MS_EXCEPTION_IF_NULL(somas_allocator_ptr_); + return 
somas_allocator_ptr_->GetNodeWorkSpacePtr(node, index); } return MallocDynamicMem(size, false); } diff --git a/mindspore/ccsrc/runtime/device/memory_manager.h b/mindspore/ccsrc/runtime/device/memory_manager.h index d97bbdfca4a..c327ff35e2d 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.h +++ b/mindspore/ccsrc/runtime/device/memory_manager.h @@ -22,14 +22,15 @@ #include #include #include "common/mem_reuse/mem_reuse.h" -#include "backend/common/somas/somas.h" +#include "runtime/device/common_somas_allocator.h" + namespace mindspore { namespace device { enum MemType { kStaticMem, kDynamicMem, kSomasReuseDynamicMem }; constexpr int kGetAllOuts = -1; constexpr uint64_t kMemAlignSize = 512; constexpr uint64_t kTwiceMemAlignSize = kMemAlignSize << 1; -using SomasPtr = mindspore::somas::SomasPtr; +using SomasAllocatorPtr = mindspore::device::CommonSomasAllocatorPtr; class MemoryManager { public: @@ -80,7 +81,7 @@ class MemoryManager { return MallocStaticMem(size, communication_mem, kInvalidGraphId); } virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem); - SomasPtr somas_reuse_util_ptr_{nullptr}; + SomasAllocatorPtr somas_allocator_ptr_{nullptr}; }; } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/memory_manager_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/memory_manager_actor.cc index 0f64d3a08e9..565a18a1137 100644 --- a/mindspore/ccsrc/runtime/graph_scheduler/actor/memory_manager_actor.cc +++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/memory_manager_actor.cc @@ -81,6 +81,10 @@ void MemoryManagerActor::AllocateContinuousMemory(const std::vectorGetPtr() != nullptr) { + continue; + } // Allocate memory through the device context. 
device::DynamicMemAllocatorDebugInfo::SetDebugInfo(from_aid.Name(), device::AllocatorType::kKernelOutput); auto dev_ptr_list = device_context->device_res_manager_->AllocateContinuousMemory(size_list); diff --git a/mindspore/core/utils/ms_context.cc b/mindspore/core/utils/ms_context.cc index 2c1d9e39484..104a6cc006c 100644 --- a/mindspore/core/utils/ms_context.cc +++ b/mindspore/core/utils/ms_context.cc @@ -102,6 +102,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { set_param(MS_CTX_ENABLE_RECOVERY, false); set_param(MS_CTX_ENABLE_GE_HETEROGENOUS, false); set_param(MS_CTX_DISABLE_FORMAT_TRANSFORM, false); + set_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO0); uint32_t kDefaultRuntimeNumThreads = 30; uint32_t cpu_core_num = std::thread::hardware_concurrency() - 1; diff --git a/mindspore/core/utils/ms_context.h b/mindspore/core/utils/ms_context.h index 4cd7afef3bc..3a9e68b2de3 100644 --- a/mindspore/core/utils/ms_context.h +++ b/mindspore/core/utils/ms_context.h @@ -55,6 +55,8 @@ const char kGpuInferenceDevice[] = "GpuInference"; const char kDavinciDevice[] = "Davinci"; const char KNpuLog[] = "_npu_log"; const unsigned int MAX_CALL_DEPTH_DEFAULT = 1000; +const int kOptimizeO0 = 0; +const int kOptimizeO1 = 1; const std::set kTargetSet = {kCPUDevice, kGPUDevice, kAscendDevice, kDavinciDevice}; // The default max available device memory is 1024GB. 
@@ -98,6 +100,7 @@ enum MsCtxParam : unsigned { // parameter of type int MS_CTX_TYPE_INT_BEGIN = MS_CTX_TYPE_BOOL_END, MS_CTX_EXECUTION_MODE = MS_CTX_TYPE_INT_BEGIN, + MS_CTX_MEMORY_OPTIMIZE_LEVEL, MS_CTX_TYPE_INT_END, // parameter of type uint32 diff --git a/mindspore/lite/src/extendrt/CMakeLists.txt b/mindspore/lite/src/extendrt/CMakeLists.txt index 4e53a2c97d3..4da401546df 100644 --- a/mindspore/lite/src/extendrt/CMakeLists.txt +++ b/mindspore/lite/src/extendrt/CMakeLists.txt @@ -98,7 +98,6 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE) ${CCSRC_DIR}/backend/common/somas/somas_solver_alg.cc ${CCSRC_DIR}/backend/graph_compiler/graph_partition.cc ${CMAKE_CURRENT_SOURCE_DIR}/mock/segment_runner.cc - ${CCSRC_DIR}/runtime/device/auto_mem_offload.cc ${CCSRC_DIR}/runtime/device/ms_device_shape_transfer.cc ${CCSRC_DIR}/runtime/device/kernel_info.cc ${CCSRC_DIR}/runtime/device/convert_tensor_utils.cc @@ -109,6 +108,7 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE) ${CCSRC_DIR}/runtime/device/memory_offload_strategy.cc ${CCSRC_DIR}/runtime/device/memory_manager.cc ${CCSRC_DIR}/runtime/device/auto_mem_offload.cc + ${CCSRC_DIR}/runtime/device/common_somas_allocator.cc ${CCSRC_DIR}/runtime/pynative/op_executor.cc ${CCSRC_DIR}/runtime/pynative/op_runtime_info.cc ${CCSRC_DIR}/runtime/hardware/device_type.cc @@ -117,6 +117,8 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE) ${CCSRC_DIR}/kernel/kernel.cc ${CCSRC_DIR}/kernel/kash/kernel_pack.cc ${CCSRC_DIR}/kernel/oplib/oplib.cc + ${CCSRC_DIR}/common/debug/anf_dump_utils.cc + ${CCSRC_DIR}/common/debug/anf_ir_dump.cc ${CCSRC_DIR}/common/debug/common.cc ${CCSRC_DIR}/common/debug/env_config_parser.cc ${CCSRC_DIR}/common/thread_pool.cc diff --git a/mindspore/python/mindspore/context.py b/mindspore/python/mindspore/context.py index 9282db75b36..5aaf93975bf 100644 --- a/mindspore/python/mindspore/context.py +++ b/mindspore/python/mindspore/context.py @@ -197,6 +197,22 @@ class _Context: f"or context.PYNATIVE_MODE (1), but got {mode}.") 
self.set_param(ms_ctx_param.mode, mode) + def set_memory_optimize_level(self, memory_optimize_level): + """ + The memory optimize level, support "O0", "O1". + + Args: + memory_optimize_level (str): "O0", "O1" + """ + memory_optimize_levels = ["O0", "O1"] + if memory_optimize_level not in memory_optimize_levels: + raise ValueError(f"For 'context.set_context', the argument 'memory_optimize_level' must be one of " + f"{memory_optimize_levels}, but got {memory_optimize_level}.") + if memory_optimize_level == "O0": + self.set_param(ms_ctx_param.memory_optimize_level, 0) + else: + self.set_param(ms_ctx_param.memory_optimize_level, 1) + def set_backend_policy(self, policy): success = self._context_handle.set_backend_policy(policy) if not success: @@ -353,7 +369,8 @@ class _Context: 'mempool_block_size': set_mempool_block_size, 'print_file_path': set_print_file_path, 'env_config_path': set_env_config_path, - 'runtime_num_threads': set_runtime_num_threads + 'runtime_num_threads': set_runtime_num_threads, + 'memory_optimize_level': set_memory_optimize_level } @property diff --git a/tests/st/networks/test_gpu_alexnet.py b/tests/st/networks/test_gpu_alexnet.py index 13561e7b7e2..a2cfd462ac9 100644 --- a/tests/st/networks/test_gpu_alexnet.py +++ b/tests/st/networks/test_gpu_alexnet.py @@ -87,3 +87,30 @@ def test_trainTensor(num_classes=10, epoch=15, batch_size=32): loss = train_network(data, label).asnumpy() losses.append(loss) assert losses[-1] < 0.01 + + +@pytest.mark.level1 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_train_tensor_memory_opt(num_classes=10, epoch=15, batch_size=32): + """ + Feature: Somas GPU kernel by kernel. + Description: AlexNet with Somas GPU kernel by kernel. + Expectation: No exception.
+ """ + context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1') + net = AlexNet(num_classes) + lr = 0.1 + momentum = 0.9 + optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, momentum, weight_decay=0.0001) + criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + net_with_criterion = WithLossCell(net, criterion) + train_network = TrainOneStepCell(net_with_criterion, optimizer) + train_network.set_train() + losses = [] + for i in range(0, epoch): + data = Tensor(np.ones([batch_size, 3, 227, 227]).astype(np.float32) * 0.01) + label = Tensor(np.ones([batch_size]).astype(np.int32)) + loss = train_network(data, label).asnumpy() + losses.append(loss) + assert losses[-1] < 0.01 diff --git a/tests/st/networks/test_gpu_lenet.py b/tests/st/networks/test_gpu_lenet.py index ca4d21d3601..5e1daa37fdc 100644 --- a/tests/st/networks/test_gpu_lenet.py +++ b/tests/st/networks/test_gpu_lenet.py @@ -150,6 +150,35 @@ def test_train_lenet(): assert losses[-1] < 0.01 +@pytest.mark.level1 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_train_lenet_memory_opt(): + """ + Feature: Somas GPU kernel by kernel. + Description: LeNet with Somas GPU kernel by kernel. + Expectation: No exception. 
+ """ + context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1') + epoch = 100 + net = LeNet() + momentum = 0.9 + learning_rate = multisteplr(epoch, 30) + + optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) + criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + net_with_criterion = WithLossCell(net, criterion) + train_network = TrainOneStepCell(net_with_criterion, optimizer) # optimizer + train_network.set_train() + losses = [] + for i in range(epoch): + data = Tensor(np.ones([net.batch_size, 3, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([net.batch_size]).astype(np.int32)) + loss = train_network(data, label).asnumpy() + losses.append(loss) + assert losses[-1] < 0.01 + + def create_dataset(data_path, batch_size=32, repeat_size=1, num_parallel_workers=1): """ diff --git a/tests/st/networks/test_gpu_lstm.py b/tests/st/networks/test_gpu_lstm.py index 4ec063278da..8cf29be06e5 100644 --- a/tests/st/networks/test_gpu_lstm.py +++ b/tests/st/networks/test_gpu_lstm.py @@ -142,3 +142,48 @@ def test_LSTM(): losses.append(loss) print("loss:", loss.asnumpy()) assert (losses[-1].asnumpy() < 0.01) + + +@pytest.mark.level1 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_lstm_memory_opt(): + """ + Feature: Somas GPU kernel by kernel. + Description: LSTM with Somas GPU kernel by kernel. + Expectation: No exception. 
+ """ + context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1') + num_epochs = 5 + embed_size = 100 + num_hiddens = 100 + num_layers = 2 + bidirectional = True + labels = 2 + vocab_size = 252193 + max_len = 500 + + weight = np.ones((vocab_size + 1, embed_size)).astype(np.float32) + + net = SentimentNet(vocab_size=(vocab_size + 1), embed_size=embed_size, + num_hiddens=num_hiddens, num_layers=num_layers, + bidirectional=bidirectional, weight=weight, + labels=labels, batch_size=batch_size) + + learning_rate = 0.1 + momentum = 0.9 + + optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) + criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + net_with_criterion = WithLossCell(net, criterion) + train_network = TrainOneStepCell(net_with_criterion, optimizer) # optimizer + train_network.set_train() + + train_features = Tensor(np.ones([64, max_len]).astype(np.int32)) + train_labels = Tensor(np.ones([64,]).astype(np.int32)[0:64]) + losses = [] + for epoch in range(num_epochs): + loss = train_network(train_features, train_labels) + losses.append(loss) + print("loss:", loss.asnumpy()) + assert (losses[-1].asnumpy() < 0.01) diff --git a/tests/st/networks/test_gpu_resnet.py b/tests/st/networks/test_gpu_resnet.py index de67d16318d..521d0b3d23c 100644 --- a/tests/st/networks/test_gpu_resnet.py +++ b/tests/st/networks/test_gpu_resnet.py @@ -352,6 +352,36 @@ def test_trainTensor(num_classes=10, epoch=8, batch_size=1): assert (losses[-1].asnumpy() < 1) +@pytest.mark.level1 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_train_tensor_memory_opt(num_classes=10, epoch=8, batch_size=1): + """ + Feature: Somas GPU kernel by kernel. + Description: ResNet with Somas GPU kernel by kernel. + Expectation: No exception. 
+ """ + context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1') + net = resnet50(num_classes) + lr = 0.1 + momentum = 0.9 + optimizer = Momentum(filter(lambda x: x.requires_grad, + net.get_parameters()), lr, momentum) + criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + net_with_criterion = WithLossCell(net, criterion) + train_network = TrainOneStepCell( + net_with_criterion, optimizer) # optimizer + train_network.set_train() + losses = [] + for i in range(0, epoch): + data = Tensor(np.ones([batch_size, 3, 224, 224] + ).astype(np.float32) * 0.01) + label = Tensor(np.ones([batch_size]).astype(np.int32)) + loss = train_network(data, label) + losses.append(loss) + assert (losses[-1].asnumpy() < 1) + + @pytest.mark.level2 @pytest.mark.platform_x86_gpu_training @pytest.mark.env_onecard