From 1a59dc37bf8095035b3529836f5bf88ae1c9ad6f Mon Sep 17 00:00:00 2001 From: parastooashtari Date: Tue, 2 Nov 2021 11:00:38 -0400 Subject: [PATCH] add graph execution order history to dump --- .../ccsrc/backend/session/ascend_session.cc | 9 +- .../ccsrc/backend/session/gpu_session.cc | 1 + .../ccsrc/debug/data_dump/cpu_e2e_dump.cc | 26 ++ .../ccsrc/debug/data_dump/cpu_e2e_dump.h | 2 + mindspore/ccsrc/debug/data_dump/e2e_dump.cc | 37 ++- mindspore/ccsrc/debug/data_dump/e2e_dump.h | 2 + mindspore/ccsrc/debug/debug_services.cc | 253 +++++++++++++++--- mindspore/ccsrc/debug/debug_services.h | 21 +- mindspore/ccsrc/debug/debugger/debugger.cc | 4 +- .../debugger/offline_debug/dbg_services.cc | 1 + .../ccsrc/debug/debugger/tensor_summary.h | 1 + mindspore/ccsrc/debug/tensor_data.h | 5 + mindspore/ccsrc/debug/tensor_load.h | 4 +- .../ccsrc/runtime/device/kernel_adjust.cc | 10 +- .../runtime/framework/actor/debug_actor.cc | 2 + tests/st/debugger/dump_test_utils.py | 25 ++ tests/st/dump/dump_test_utils.py | 36 +++ tests/st/dump/test_data_dump.py | 2 +- tests/st/dump/test_multi_root_graph_dump.py | 158 +++++++++++ .../debugger/gpu_tests/dump_test_utils.py | 25 ++ 20 files changed, 561 insertions(+), 63 deletions(-) create mode 100644 tests/st/dump/test_multi_root_graph_dump.py diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index b74e050d189..737c385867a 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -621,6 +621,9 @@ void AscendSession::PostExecuteGraph(const std::shared_ptr &kernel_ debugger_->PostExecute(); } #endif +#ifndef ENABLE_SECURITY + DumpSetup(kernel_graph); +#endif } void AscendSession::ExecuteGraph(const std::shared_ptr &kernel_graph) { Execute(kernel_graph, true); } @@ -1342,11 +1345,6 @@ void AscendSession::Execute(const std::shared_ptr &kernel_graph, bo } auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); -#ifndef ENABLE_SECURITY - if (is_task && is_task_sink) { - DumpSetup(kernel_graph); - } -#endif bool ret_ok = runtime_instance->Run(*kernel_graph, is_task_sink); #ifndef ENABLE_SECURITY if (is_task && is_task_sink) { @@ -1373,6 +1371,7 @@ void AscendSession::DumpSetup(const std::shared_ptr &kernel_graph) void AscendSession::Dump(const std::shared_ptr &kernel_graph) const { MS_LOG(DEBUG) << "Start!"; MS_EXCEPTION_IF_NULL(kernel_graph); + E2eDump::DumpRunIter(kernel_graph, rank_id_); E2eDump::DumpData(kernel_graph.get(), rank_id_); MS_LOG(DEBUG) << "Finish!"; } diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 96cd2269392..1eec34b960d 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -722,6 +722,7 @@ void GPUSession::DumpSetup(const std::shared_ptr &kernel_graph) con void GPUSession::Dump(const std::shared_ptr &kernel_graph) const { if (debugger_->DebuggerBackendEnabled()) { MS_EXCEPTION_IF_NULL(kernel_graph); + E2eDump::DumpRunIter(kernel_graph, rank_id_); E2eDump::DumpData(kernel_graph.get(), rank_id_, debugger_.get()); } else { DumpJsonParser::GetInstance().UpdateDumpIter(); diff --git a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc index 17d56601023..c116a0150c8 100644 --- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc +++ 
b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc @@ -18,6 +18,7 @@ #include #include "backend/session/anf_runtime_algorithm.h" #include "debug/anf_ir_utils.h" +#include "debug/common.h" namespace mindspore { void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) { @@ -39,6 +40,31 @@ void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) { } } +void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) { + auto &json_parser = DumpJsonParser::GetInstance(); + if (!(json_parser.e2e_dump_enabled())) { + return; + } + std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/"; + std::string file_name_to_check = + execution_order_path + "/ms_global_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv"; + auto real_path = Common::CreatePrefixPath(file_name_to_check); + if (!real_path.has_value()) { + MS_LOG(WARNING) << "Check file path: " << file_name_to_check << " failed."; + return; + } + std::string file_name = real_path.value(); + ChangeFileMode(file_name, S_IWUSR); + std::ofstream fout(file_name, std::ofstream::app); + if (!fout.is_open()) { + MS_LOG(WARNING) << "Open file for saving graph global execution order failed."; + return; + } + fout << std::to_string(json_parser.cur_dump_iter()) + "\n"; + fout.close(); + ChangeFileMode(file_name, S_IRUSR); +} + void CPUE2eDump::DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path) { MS_EXCEPTION_IF_NULL(node); std::string kernel_name = GetKernelNodeName(node); diff --git a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h index 17d0aad2c2a..ee297ce9a1c 100644 --- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h +++ b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h @@ -35,6 +35,8 @@ class CPUE2eDump { static void DumpCNodeData(const CNodePtr &node, uint32_t graph_id); + static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0); + private: static void DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path); diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc index 68d5af02bc0..4159ae2b1ac 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc @@ -290,13 +290,9 @@ void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_m MS_LOG(INFO) << "No need to update iteration for dataset graph."; return; } - if (starting_graph_id == INT32_MAX) { - // Identify the first graph id and not increasing dump iter for the first iteration (initial dump iter = 0). - starting_graph_id = graph_id; - } else { - // In multi network scripts, dump iter is equal to the number of networks that have been run so far. - dump_json_parser.UpdateDumpIter(); - } + + // In multi network scripts, dump iter is equal to the number of networks that have been executed so far. 
+  dump_json_parser.UpdateDumpIter();
 }
 
 void E2eDump::DumpSetup(const session::KernelGraph *graph) {
@@ -308,10 +304,31 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph) {
   }
 }
 
-void E2eDump::UpdateIterGPUDump() {
-  if (starting_graph_id != INT32_MAX) {
-    DumpJsonParser::GetInstance().UpdateDumpIter();
+void E2eDump::UpdateIterGPUDump() { DumpJsonParser::GetInstance().UpdateDumpIter(); }
+
+void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
+  auto &json_parser = DumpJsonParser::GetInstance();
+  if (!(json_parser.async_dump_enabled() || json_parser.e2e_dump_enabled())) {
+    return;
   }
+  std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
+  std::string file_name_to_check =
+      execution_order_path + "/ms_global_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv";
+  auto real_path = Common::CreatePrefixPath(file_name_to_check);
+  if (!real_path.has_value()) {
+    MS_LOG(WARNING) << "Check file path: " << file_name_to_check << " failed.";
+    return;
+  }
+  std::string file_name = real_path.value();
+  ChangeFileMode(file_name, S_IWUSR);
+  std::ofstream fout(file_name, std::ofstream::app);
+  if (!fout.is_open()) {
+    MS_LOG(WARNING) << "Open file for saving graph global execution order failed.";
+    return;
+  }
+  fout << std::to_string(json_parser.cur_dump_iter()) + "\n";
+  fout.close();
+  ChangeFileMode(file_name, S_IRUSR);
 }
 
 void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h
index b99a6b2e284..89153718176 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h
@@ -39,6 +39,8 @@ class E2eDump {
 
   static void UpdateIterGPUDump();
 
+  static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
+
   static void DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
 
   static bool DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger);
diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc
index 6e2d6bafa1b..64f5cd36cf8 100644
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <regex>
 #include "pybind11/embed.h"
 #include "pybind11/stl.h"
 #ifdef ONLINE_DBG_MODE
@@ -33,8 +34,10 @@
 #include "debug/anf_ir_utils.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #endif
+#include "nlohmann/json.hpp"
 #include "debug/debugger/tensor_summary.h"
 #include "utils/file_utils.h"
+#include "linux/limits.h"
 #ifdef ONLINE_DBG_MODE
 namespace mindspore {
 #endif
@@ -172,23 +175,28 @@ DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_p
 }
 #ifdef OFFLINE_DBG_MODE
 const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
-                                         uint32_t *prev_num_elements) {
+                                         uint32_t *prev_num_elements, bool *history_not_found) {
   MS_EXCEPTION_IF_NULL(tensor);
   const void *previous_tensor_ptr = nullptr;
   std::shared_ptr<TensorData> tensor_prev;
-  if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
+  std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
+  if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
+    *history_not_found = 1;
+    MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
+  } else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
+    // when prev_tensor is not available, the prev iteration is set to UINT32_MAX
     // read data in offline mode
     std::vector<std::string> file_paths;
     if (!is_sync_mode_) {
       ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                          std::vector<unsigned int>{tensor->GetDeviceId()},
-                         std::vector<unsigned int>{tensor->GetIteration() - 1},
+                         std::vector<unsigned int>{tensor->GetPrevIteration()},
                          std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
     }
     std::vector<std::shared_ptr<TensorData>> result_list_prev;
     ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                      std::vector<unsigned int>{tensor->GetDeviceId()},
-                     std::vector<unsigned int>{tensor->GetIteration() - 1},
+                     std::vector<unsigned int>{tensor->GetPrevIteration()},
                      std::vector<unsigned int>{tensor->GetRootGraphId()},
                      std::vector<bool>{tensor->GetIsOutput()}, file_paths, &result_list_prev);
     tensor_prev = result_list_prev[0];
@@ -303,7 +311,7 @@ void DebugServices::ProcessCheckpointsOutofMemory(
     const std::vector<parameter_t> &parameter_list) {
   if (no_mem_to_read) {
     // bit 3 denotes failed to load tensor because tensor is oversized and there is not enough memory to fit it in
-    int32_t oversize_error_code = 8;
+    int32_t oversize_error_code = ITensorSummary::OUT_OF_MEMORY;
     for (auto &wp : watchpoints_to_check) {
       SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
                                 chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
@@ -313,6 +321,18 @@
     }
   }
 }
+
+void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
+  // set the tensor into not-in-use status in tensor_loader.
+  auto tensor_name = tensor->GetName();
+  std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
+                                  std::to_string(tensor->GetRootGraphId()) + ":" +
+                                  std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
+  AppendToCacheEvictQueue(key_name_in_cache);
+  if (previous_tensor_ptr != nullptr) {
+    AppendToCacheEvictQueue(key_name_in_cache + ":prev");
+  }
+}
 #endif
 
 void DebugServices::CheckWatchpointsForTensor(
@@ -373,7 +393,8 @@ void DebugServices::CheckWatchpointsForTensor(
     uint32_t prev_num_elements = 0;
     const void *previous_tensor_ptr = nullptr;
 #ifdef OFFLINE_DBG_MODE
-    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements);
+    bool history_not_found = 0;
+    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
 #else
     std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
     if (prev_tensor_data) {
@@ -400,6 +421,11 @@
       auto item = base_summary_ptr->IsWatchpointHit(wp);
       is_hit = std::get<ITensorSummary::eHitPos>(item);
       error_code = std::get<ITensorSummary::eErrorCodePos>(item);
+#ifdef OFFLINE_DBG_MODE
+      if (history_not_found) {
+        error_code = ITensorSummary::HISTORY_NOT_FOUND;  // error code for history not found
+      }
+#endif
       parameter_list = std::get<ITensorSummary::eParamListPos>(item);
     }
     AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
@@ -413,14 +439,7 @@
   }
 
 #ifdef OFFLINE_DBG_MODE
-  // set the tensor into not-in-use status in tensor_loader.
-  std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
-                                  std::to_string(tensor->GetRootGraphId()) + ":" +
-                                  std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
-  AppendToCacheEvictQueue(key_name_in_cache);
-  if (previous_tensor_ptr != nullptr) {
-    AppendToCacheEvictQueue(key_name_in_cache + ":prev");
-  }
+  SetTensorToNotInUse(tensor, previous_tensor_ptr);
   // in offline mode remove the need for the data
   tensor.reset();
 #endif
@@ -679,7 +698,7 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
   std::string real_dump_iter_dir = RealPath(dump_key);
   DIR *d_handle = opendir(real_dump_iter_dir.c_str());
   if (d_handle == nullptr) {
-    MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat.";
+    MS_LOG(ERROR) << "Directory does not exist in ConvertToHostFormat.";
     return;
   }
   struct dirent *dir = nullptr;
@@ -859,12 +878,153 @@ void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>>
     tensor_data->SetType("");
     tensor_data->SetShape(shape);
     tensor_data->SetIsOutput(output_flag);
+    tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
     tensor_list->push_back(tensor_data);
   }
 }
 
+uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
+  std::regex re;
+  if (mode == "rank") {
+    re = "^rank_([0-9]+)$";
+  } else if (mode == "graph") {
+    re = "^([0-9]+)$";
+  }
+  std::smatch tokens;
+  if (regex_match(name, tokens, re)) {
+    return std::stoi(tokens[1]);
+  } else {
+    return UINT32_MAX;
+  }
+}
+
+std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
+  std::vector<uint32_t> rank_id_list;
+  std::string dump_dir = GetDumpDir();
+  DIR *d_handle = opendir(dump_dir.c_str());
+  if (d_handle == nullptr) {
+    MS_LOG(ERROR) << "Dump directory does not exist.";
+    return rank_id_list;
+  }
+  struct dirent *dir = nullptr;
+  while ((dir = readdir(d_handle)) != nullptr) {
+    if (dir->d_type == DT_DIR) {
+      std::string rank_dir_name = dir->d_name;
+      if (GetRankOrGraphId("rank", rank_dir_name) != UINT32_MAX) {
+        rank_id_list.push_back(GetRankOrGraphId("rank", rank_dir_name));
+      }
+    }
+  }
+  (void)closedir(d_handle);
+  return rank_id_list;
+}
+
+void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
+  std::string net_name = GetNetName();
+  std::string dump_dir = GetDumpDir();
+  for (uint32_t rank_id : rank_id_list) {
+    std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
+    std::string abspath = RealPath(path);
+    DIR *d_handle_rank = opendir(abspath.c_str());
+    if (d_handle_rank == nullptr) {
+      MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
+      continue;
+    }
+    struct dirent *direc = nullptr;
+    while ((direc = readdir(d_handle_rank)) != nullptr) {
+      if (direc->d_type == DT_DIR) {
+        std::string graph_dir = direc->d_name;
+        if (graph_dir == "." || graph_dir == "..") {
+          continue;
+        }
+        if (GetRankOrGraphId("graph", graph_dir) != UINT32_MAX) {
+          uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
+          ReadGraphsHistory(rank_id, graph_id);
+        }
+      }
+    }
+    (void)closedir(d_handle_rank);
+  }
+}
+
+void DebugServices::SetGraphsHistory() {
+  // extract rank_id_list
+  std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
+  // for each rank_id extract the graph_id list and set the dump version
+  // and for each graph read the graph history file
+  CheckDumpGraphIdList(rank_id_list);
+}
+
+void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
+  std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
+  if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
+    // graph history was already stored for this rank_id and graph_id
+    return;
+  }
+  std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
+  std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
+  DIR *d_handle = opendir(exec_order_path.c_str());
+  if (d_handle == nullptr) {
+    MS_LOG(ERROR) << "Directory does not exist.";
+    return;
+  }
+  // read file and store the info
+  std::string full_path = exec_order_path + "/" + file_to_check;
+  std::string checked_path = RealPath(full_path);
+  if (!checked_path.empty()) {
+    ReadGraphRunIter(checked_path, rank_and_graph);
+  }
+  (void)closedir(d_handle);
+}
+
+std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
+  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
+  for (auto w_table_item : watchpoint_table_) {
+    auto wp = std::get<1>(w_table_item);
+    unsigned int index = 0;
+    for (auto check_node : wp.check_node_list) {
+      std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
+      std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
+      // graph represents root_graph for Ascend and kernel_graph for GPU
+      for (auto rank : ranks) {
+        for (auto graph : graphs) {
+          std::tuple<uint32_t, uint32_t> key(rank, graph);
+          (rank_and_graph_to_nodes)[key].push_back(check_node);
+        }
+      }
+      index++;
+    }
+  }
+  return rank_and_graph_to_nodes;
+}
+
+void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
+  std::ifstream infile;
+  std::string line;
+  infile.open(file_path.c_str());
+  if (!infile.is_open()) {
+    MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
+    const int kMaxFilenameLength = NAME_MAX;
+    char err_info[kMaxFilenameLength];
+    if (strerror_r(errno, err_info, sizeof(err_info)) != nullptr) {
+      MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
+    }
+    return;
+  }
+  std::vector<uint32_t> run_iters_vec;
+  while (std::getline(infile, line)) {
+    uint32_t iter;
+    std::stringstream ss(line);
+    ss >> iter;
+    run_iters_vec.push_back(iter);
+  }
+  (void)graphs_run_history_.emplace(
+      std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
+}
+
 void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
                                     const std::size_t slot, const unsigned int iteration,
                                     const unsigned int device_id, const unsigned int root_graph_id,
                                     const bool is_output, const std::size_t data_size,
@@ -889,6 +1049,7 @@ void DebugServices::AddToTensorData(const std::string &backend_name, const std::
   tensor_data->SetType(type_name);
   tensor_data->SetShape(shape);
   tensor_data->SetTimeStamp(time_stamp);
+  tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
   if (data_size) {
     (void)tensor_loader_->LoadNewTensor(tensor_data, false);
   }
@@ -1080,34 +1241,19 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
     unsigned int iteration, std::vector<std::string> *const async_file_pool) {
   // get a list of nodes and the devices they are on to monitor
   std::vector<std::shared_ptr<TensorData>> tensor_list;
-  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
-  for (auto w_table_item : watchpoint_table_) {
-    auto wp = std::get<1>(w_table_item);
-    unsigned int index = 0;
-    for (auto check_node : wp.check_node_list) {
-      std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
-      std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
-      for (auto device : devices) {
-        for (auto graph : graphs) {
-          std::tuple<uint32_t, uint32_t> key(device, graph);
-          device_and_graph_to_nodes[key].push_back(check_node);
-        }
-      }
-
-      index++;
-    }
-  }
+  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
+      GetAllWpNodes();
   // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
   // as they are found
-  for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
-    std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
-    uint32_t device_id = std::get<0>(device_and_graph);
-    uint32_t root_graph_id = std::get<1>(device_and_graph);
-    std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
+  for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
+    std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
+    uint32_t rank_id = std::get<0>(rank_and_graph);
+    uint32_t root_graph_id = std::get<1>(rank_and_graph);
+    std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
     std::vector<std::tuple<std::string, std::string>> proto_to_dump;
-    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
+    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
                                     std::to_string(root_graph_id) + "/" + IterationString(iteration);
 
     // convert node names to dump style
@@ -1131,12 +1277,11 @@
     if (is_sync_mode_) {
       // search files in dir for the one that meets the filename prefix and read the file into memory
       std::string abspath = RealPath(specific_dump_dir);
-      ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, device_id, root_graph_id,
-                            &tensor_list);
+      ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, rank_id, root_graph_id, &tensor_list);
     } else {
       // convert all files in proto_to_dump to npy and add to pool of async file names
       ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
-      GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
+      GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, rank_id, root_graph_id, *async_file_pool,
                              &tensor_list);
     }
   }
@@ -1276,6 +1421,32 @@ bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, boo
   return tensor_loader_->LoadNewTensor(tensor, keep_prev);
 }
 
+uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
+  uint32_t prev_iter;
+  uint32_t rank_id = tensor->GetDeviceId();
+  uint32_t root_graph_id = tensor->GetRootGraphId();
+  std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
+  if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
+    return UINT32_MAX;
+  }
+  auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
+                      tensor->GetIteration());
+  if (it == graphs_run_history_[rank_and_graph].end()) {
+    // The graph was not executed in that iteration
+    return UINT32_MAX;
+  } else if (it == graphs_run_history_[rank_and_graph].begin()) {
+    // current iteration is the first iteration that the graph was run
+    // no prev iter is available
+    MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
+                  << " is the first run iteration for tensor: " << tensor->GetName();
+    return UINT32_MAX;
+  }
+  it--;
+  prev_iter = *it;
+  tensor->SetPrevIteration(prev_iter);
+  return prev_iter;
+}
+
 void DebugServices::ResetLoadedTensors() {
   wp_id_cache_.clear();
   MS_LOG(INFO) << "Resetting loaded tensors";
diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h
index 3a49da33c22..daa2e0955ea 100644
--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@@ -292,6 +292,9 @@ class DebugServices {
                                  std::vector<unsigned int> *chunk_tensor_byte_size, partitioned_id *chunk_device_id,
                                  partitioned_id *chunk_root_graph_id, std::vector<unsigned int> *device_id,
                                  std::vector<unsigned int> *root_graph_id);
+#ifdef OFFLINE_DBG_MODE
+  void SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr);
+#endif
 
   void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
                              const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
@@ -350,7 +353,7 @@
                            std::vector<std::string> *const async_file_pool);
 
   const void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
-                            uint32_t *prev_num_elements);
+                            uint32_t *prev_num_elements, bool *history_not_found);
 
   void ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name, std::string *const tensor_type,
                          std::size_t *const size, std::vector<int64_t> *const shape,
@@ -380,6 +383,18 @@
                               uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                               std::vector<std::shared_ptr<TensorData>> *const tensor_list);
 
+  void SetGraphsHistory();
+
+  std::vector<uint32_t> GetDumpRankIdList();
+
+  void CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list);
+
+  void ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id);
+
+  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> GetAllWpNodes();
+
+  void ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph);
+
   std::string GetStrippedFilename(const std::string &file_name);
 
   std::string IterationString(unsigned int iteration);
@@ -410,6 +425,8 @@
 
   bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
 
+  uint32_t GetPrevIteration(const std::shared_ptr<TensorData> &tensor);
+
   void ResetLoadedTensors();
 #ifdef ONLINE_DBG_MODE
   std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
@@ -458,6 +475,8 @@
   std::unordered_map<std::string, std::vector<std::string>> overflow_ops_;
   std::string net_name_;
   std::string dump_dir_;
+  // store history of graphs that have been run (rank_id, graph_id)
+  std::map<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>> graphs_run_history_;
 
   bool is_sync_mode_{false};
 
   std::shared_ptr<TensorLoader> tensor_loader_;
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index 4481632de76..7e5207ee25e 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -282,7 +282,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
   if (device_target_ != kGPUDevice) {
     return;
   }
-  E2eDump::UpdateIterGPUDump();
   // Store graphs that are run in one step.
graph_ptr_step_vec_ = graphs; for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { @@ -290,7 +289,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector &graphs if (debugger_) { debugger_->PreExecute(graph); } - DumpSetup(graph); } } @@ -390,6 +388,7 @@ uint32_t Debugger::GetRankID() { void Debugger::Dump(const KernelGraphPtr &kernel_graph) const { uint32_t rank_id = GetRankID(); + E2eDump::DumpRunIter(kernel_graph, rank_id); if (debugger_ && debugger_->DebuggerBackendEnabled()) { MS_EXCEPTION_IF_NULL(kernel_graph); (void)E2eDump::DumpParametersAndConstData(kernel_graph.get(), rank_id, debugger_.get()); @@ -458,6 +457,7 @@ void Debugger::PostExecuteGraphDebugger() { if (debugger_) { debugger_->PostExecute(); } + E2eDump::UpdateIterGPUDump(); } void Debugger::PostExecute() { diff --git a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc index 77407511944..76d3643fc1d 100644 --- a/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc +++ b/mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc @@ -62,6 +62,7 @@ int32_t DbgServices::Initialize(const std::string net_name, const std::string du auto cache_mem_ratio = 0.5; const uint64_t memlimit = max_mem_usage * kMegabytesToBytes * cache_mem_ratio; debug_services_->SetMemLimit(memlimit); + debug_services_->SetGraphsHistory(); return 0; } diff --git a/mindspore/ccsrc/debug/debugger/tensor_summary.h b/mindspore/ccsrc/debug/debugger/tensor_summary.h index 6b9794d434b..e6117bde62b 100644 --- a/mindspore/ccsrc/debug/debugger/tensor_summary.h +++ b/mindspore/ccsrc/debug/debugger/tensor_summary.h @@ -88,6 +88,7 @@ class VarianceAndMeanCalculator { class ITensorSummary { public: enum WatchpointPos { eHitPos = 0, eErrorCodePos = 1, eParamListPos = 2 }; + enum ErrorCode { NAN_TENSOR = 0, INF_TENSOR = 2, NULL_PREV_TENSOR = 4, OUT_OF_MEMORY = 8, HISTORY_NOT_FOUND = 16 }; virtual ~ITensorSummary() = default; virtual void SummarizeTensor(const std::vector &) = 0; virtual std::tuple> IsWatchpointHit( diff --git a/mindspore/ccsrc/debug/tensor_data.h b/mindspore/ccsrc/debug/tensor_data.h index e8cfe33503c..46e4a1190e7 100644 --- a/mindspore/ccsrc/debug/tensor_data.h +++ b/mindspore/ccsrc/debug/tensor_data.h @@ -229,6 +229,10 @@ class TensorData { void SetIteration(unsigned int iteration) { this->iteration_ = iteration; } + unsigned int GetPrevIteration() const { return this->prev_iteration_; } + + void SetPrevIteration(unsigned int prev_iteration) { this->prev_iteration_ = prev_iteration; } + unsigned int GetDeviceId() const { return this->device_id_; } void SetDeviceId(unsigned int device_id) { this->device_id_ = device_id; } @@ -430,6 +434,7 @@ class TensorData { std::string name_; uint64_t slot_; unsigned int iteration_{0}; + unsigned int prev_iteration_{0}; unsigned int device_id_{0}; unsigned int root_graph_id_{0}; bool is_output_{true}; diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h index 82816c8c0ed..7ce1e5bc2ed 100644 --- a/mindspore/ccsrc/debug/tensor_load.h +++ b/mindspore/ccsrc/debug/tensor_load.h @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2019-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -90,7 +90,7 @@ class TensorLoader
       key_name += (":" + std::to_string(tensor->GetDeviceId()) + ":" + std::to_string(tensor->GetRootGraphId()) +
                    ":" + std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot()));
       if (tensor_list_map_.find(key_name) != tensor_list_map_.end() &&
-          tensor->GetIteration() == tensor_list_map_[key_name]->GetIteration() - 1) {
+          tensor->GetIteration() == tensor_list_map_[key_name]->GetPrevIteration()) {
         key_name += ":prev";
       }
       auto iter = tensor_list_map_.find(key_name);
diff --git a/mindspore/ccsrc/runtime/device/kernel_adjust.cc b/mindspore/ccsrc/runtime/device/kernel_adjust.cc
index e0440eb0a02..d672268c1ba 100644
--- a/mindspore/ccsrc/runtime/device/kernel_adjust.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_adjust.cc
@@ -33,6 +33,9 @@
 #include "runtime/base.h"
 #include "runtime/device/ascend/ascend_stream_manager.h"
 #include "utils/shape_utils.h"
+#ifndef ENABLE_SECURITY
+#include "debug/data_dump/dump_json_parser.h"
+#endif
 
 namespace {
 constexpr auto kGradients = "Gradients";
@@ -1049,7 +1052,12 @@ void KernelAdjust::LoadDeviceLoopCtrlParameters(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) {
-  SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, SizeToInt(kernel_graph_ptr->current_epoch()));
+#ifndef ENABLE_SECURITY
+  SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName,
+                          SizeToInt(DumpJsonParser::GetInstance().cur_dump_iter()));
+#else
+  SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, 0);
+#endif
   kernel_graph_ptr->set_current_epoch(kernel_graph_ptr->current_epoch() + 1);
 }
diff --git a/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc b/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
index e8b2b75b01f..6ec4f51c298 100644
--- a/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
@@ -23,6 +23,7 @@
 #include "utils/log_adapter.h"
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/cpu_e2e_dump.h"
+#include "debug/data_dump/e2e_dump.h"
 #endif
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
@@ -52,6 +53,7 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
     auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
     MS_EXCEPTION_IF_NULL(kernel_graph);
     CPUE2eDump::DumpCNodeData(cnode, kernel_graph->graph_id());
+    CPUE2eDump::DumpRunIter(kernel_graph);
   }
 #endif
   } else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {
diff --git a/tests/st/debugger/dump_test_utils.py b/tests/st/debugger/dump_test_utils.py
index 77a41113fde..e5e9828f6f5 100644
--- a/tests/st/debugger/dump_test_utils.py
+++ b/tests/st/debugger/dump_test_utils.py
@@ -18,11 +18,14 @@ Utils for testing offline debugger.
 import os
 import tempfile
+import bisect
+import csv
 import numpy as np
 
 
 def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_info_list):
     """Build dump file structure from tensor_list."""
+    ranks_run_history = {}
     temp_dir = tempfile.mkdtemp(prefix=net_name, dir=path)
     for tensor_name, tensor, tensor_info in zip(tensor_name_list, tensor_list, tensor_info_list):
         slot = str(tensor_info.slot)
@@ -30,6 +33,16 @@ def build_dump_structure(path, tensor_name_list, tensor_list, net_name, tensor_i
         rank_id = str(tensor_info.rank_id)
         root_graph_id = str(tensor_info.root_graph_id)
         is_output = str(tensor_info.is_output)
+        if rank_id not in ranks_run_history:
+            ranks_run_history[rank_id] = {}
+        # Always index through ranks_run_history so the right rank's history is updated.
+        graphs_run_history = ranks_run_history[rank_id]
+        if root_graph_id not in graphs_run_history:
+            graphs_run_history[root_graph_id] = [iteration]
+        elif iteration not in graphs_run_history[root_graph_id]:
+            bisect.insort(graphs_run_history[root_graph_id], iteration)
+
         path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
         os.makedirs(path, exist_ok=True)
         if is_output == "True":
@@ -40,4 +53,16 @@
                                 ".DefaultFormat.npy", dir=path)
         full_path = file[1]
         np.save(full_path, tensor)
+    build_global_execution_order(temp_dir, ranks_run_history)
     return temp_dir
+
+
+def build_global_execution_order(path, ranks_run_history):
+    for rank_id in ranks_run_history.keys():
+        exec_order_path = os.path.join(path, "rank_" + rank_id, "execution_order")
+        os.makedirs(exec_order_path, exist_ok=True)
+        for graph in ranks_run_history[rank_id].keys():
+            full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
+            with open(full_path, 'w+', newline='') as csv_file:
+                writer = csv.writer(csv_file)
+                # Wrap each iteration in its own row; writerows() on bare strings
+                # would split a multi-digit iteration into separate columns.
+                writer.writerows([[iteration] for iteration in ranks_run_history[rank_id][graph]])
diff --git a/tests/st/dump/dump_test_utils.py b/tests/st/dump/dump_test_utils.py
index 3b3a0d64078..550adad8828 100644
--- a/tests/st/dump/dump_test_utils.py
+++ b/tests/st/dump/dump_test_utils.py
@@ -69,6 +69,35 @@ async_dump_dict_2 = {
     }
 }
 
+e2e_dump_dict_2 = {
+    "common_dump_settings": {
+        "dump_mode": 0,
+        "path": "",
+        "net_name": "Net",
+        "iteration": "all",
+        "input_output": 0,
+        "kernels": ["Default/Conv-op12"],
+        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
+        "op_debug_mode": 0
+    },
+    "e2e_dump_settings": {
+        "enable": True,
+        "trans_flag": False
+    }
+}
+
+async_dump_dict_3 = {
+    "common_dump_settings": {
+        "dump_mode": 0,
+        "path": "",
+        "net_name": "Net",
+        "iteration": "all",
+        "input_output": 2,
+        "kernels": ["Default/TensorAdd-op3"],
+        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
+        "op_debug_mode": 0
+    }
+}
+
 
 def generate_dump_json(dump_path, json_file_name, test_key):
     """
@@ -83,6 +112,13 @@ def generate_dump_json(dump_path, json_file_name, test_key):
     elif test_key == "test_async_dump_net_multi_layer_mode1":
         data = async_dump_dict_2
         data["common_dump_settings"]["path"] = dump_path
+    elif test_key in ("test_GPU_e2e_multi_root_graph_dump", "test_Ascend_e2e_multi_root_graph_dump"):
+        data = e2e_dump_dict_2
+        data["common_dump_settings"]["path"] = dump_path
+    elif test_key == "test_Ascend_async_multi_root_graph_dump":
+        data = async_dump_dict_3
+        data["common_dump_settings"]["path"] = dump_path
     else:
         raise ValueError(
             "Failed to generate dump json file. The test name value " + test_key + " is invalid.")
diff --git a/tests/st/dump/test_data_dump.py b/tests/st/dump/test_data_dump.py
index 791078e8eac..2d676baa34f 100644
--- a/tests/st/dump/test_data_dump.py
+++ b/tests/st/dump/test_data_dump.py
@@ -283,7 +283,7 @@ def run_e2e_dump_execution_graph():
     add = Net()
     add(Tensor(x), Tensor(y))
     exe_graph_path = os.path.join(dump_path, 'rank_0', 'execution_order')
-    assert len(os.listdir(exe_graph_path)) == 1
+    assert len(os.listdir(exe_graph_path)) == 2
     del os.environ['MINDSPORE_DUMP_CONFIG']
diff --git a/tests/st/dump/test_multi_root_graph_dump.py b/tests/st/dump/test_multi_root_graph_dump.py
new file mode 100644
index 00000000000..2f483c3b945
--- /dev/null
+++ b/tests/st/dump/test_multi_root_graph_dump.py
@@ -0,0 +1,158 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import os
+from os import path
+import tempfile
+import time
+import shutil
+import csv
+import numpy as np
+import pytest
+import mindspore.context as context
+from mindspore import Tensor
+from mindspore.ops import operations as P
+from mindspore.nn import Cell
+from dump_test_utils import generate_dump_json
+from tests.security_utils import security_off_wrap
+
+
+class AddNet(Cell):
+    def __init__(self):
+        super(AddNet, self).__init__()
+        self.add = P.TensorAdd()
+
+    def construct(self, input_x, input_y):
+        output_z = self.add(input_x, input_y)
+        return output_z
+
+
+class NewAddNet(Cell):
+    def __init__(self):
+        super(NewAddNet, self).__init__()
+        self.add = P.AddN()
+
+    def construct(self, x, y):
+        z = self.add([x, y, y])
+        return z
+
+
+def train_addnet(epoch):
+    net = AddNet()
+    net2 = NewAddNet()
+    output_list = []
+    input_x = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
+    input_y = Tensor(np.ones([2, 1, 2, 1]).astype(np.float32))
+    for _ in range(epoch):
+        out_put = net(input_x, input_y)
+        out2 = net2(out_put, input_x)
+        output_list.append(out2.asnumpy())
+        input_x = input_x + input_y
+
+
+def run_multi_root_graph_dump(device, dump_mode, test_name):
+    """Run dump for a multi root graph script."""
+
+    context.set_context(mode=context.GRAPH_MODE, device_target=device)
+
+    with tempfile.TemporaryDirectory(dir='/tmp') as tmp_dir:
+        dump_path = os.path.join(tmp_dir, dump_mode)
+        dump_config_path = os.path.join(tmp_dir, dump_mode + ".json")
+        generate_dump_json(dump_path, dump_config_path, test_name)
+        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
+        dump_file_path = os.path.join(dump_path, 'rank_0', 'Net')
+        if os.path.isdir(dump_path):
+            shutil.rmtree(dump_path)
+        epoch = 3
+        train_addnet(epoch)
+        for _ in range(3):
+            if not os.path.exists(dump_file_path):
+                time.sleep(2)
+        # Multi root graph script: we have 2 graphs under the rank_0 dir.
+        # Each graph should have 3 iterations, since each graph was executed once per epoch.
+        # Graph 0 was executed in even iterations, graph 1 was executed in odd iterations.
+        assert len(os.listdir(dump_file_path)) == 2
+        dump_path_graph_0 = os.path.join(dump_file_path, '0')
+        dump_path_graph_1 = os.path.join(dump_file_path, '1')
+        assert sorted(os.listdir(dump_path_graph_0)) == ['0', '2', '4']
+        assert sorted(os.listdir(dump_path_graph_1)) == ['1', '3', '5']
+        execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order')
+        # Four files in the execution_order dir:
+        # two files per graph (ms_execution_order and ms_global_execution_order).
+        assert len(os.listdir(execution_order_path)) == 4
+        global_exec_order_graph_0 = os.path.join(execution_order_path, 'ms_global_execution_order_graph_0.csv')
+        assert path.exists(global_exec_order_graph_0)
+        with open(global_exec_order_graph_0) as csvfile:
+            history_graph_0 = csv.reader(csvfile)
+            iter_list_graph_0 = list(history_graph_0)
+        assert iter_list_graph_0 == [['0'], ['2'], ['4']]
+        global_exec_order_graph_1 = os.path.join(execution_order_path, 'ms_global_execution_order_graph_1.csv')
+        assert path.exists(global_exec_order_graph_1)
+        with open(global_exec_order_graph_1) as csvfile:
+            history_graph_1 = csv.reader(csvfile)
+            iter_list_graph_1 = list(history_graph_1)
+        assert iter_list_graph_1 == [['1'], ['3'], ['5']]
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_GPU_e2e_multi_root_graph_dump():
+    """
+    Feature:
+        Multi root graph e2e dump for GPU.
+    Description:
+        Test multi root graph e2e dump on GPU.
+    Expectation:
+        Dump for two different graphs: graph 0 in even iterations and graph 1 in odd iterations.
+    """
+    run_multi_root_graph_dump("GPU", "e2e_dump", "test_GPU_e2e_multi_root_graph_dump")
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_Ascend_e2e_multi_root_graph_dump():
+    """
+    Feature:
+        Multi root graph e2e dump for Ascend.
+    Description:
+        Test multi root graph e2e dump on Ascend.
+    Expectation:
+        Dump for two different graphs: graph 0 in even iterations and graph 1 in odd iterations.
+    """
+
+    run_multi_root_graph_dump("Ascend", "e2e_dump", "test_Ascend_e2e_multi_root_graph_dump")
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_Ascend_async_multi_root_graph_dump():
+    """
+    Feature:
+        Multi root graph async dump for Ascend.
+    Description:
+        Test multi root graph async dump on Ascend.
+    Expectation:
+        Dump for two different graphs: graph 0 in even iterations and graph 1 in odd iterations.
+    """
+
+    run_multi_root_graph_dump("Ascend", "async_dump", "test_Ascend_async_multi_root_graph_dump")
diff --git a/tests/ut/python/debugger/gpu_tests/dump_test_utils.py b/tests/ut/python/debugger/gpu_tests/dump_test_utils.py
index a6fb6c1582f..d27b8929a04 100644
--- a/tests/ut/python/debugger/gpu_tests/dump_test_utils.py
+++ b/tests/ut/python/debugger/gpu_tests/dump_test_utils.py
@@ -18,11 +18,14 @@ Utils for testing offline debugger.
 import os
 import tempfile
+import bisect
+import csv
 import numpy as np
 
 
 def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_list):
     """Build dump file structure from tensor_list."""
+    ranks_run_history = {}
     temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./")
     for x, _ in enumerate(tensor_info_list):
         slot = str(tensor_info_list[x].slot)
@@ -30,6 +33,16 @@ def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_li
         rank_id = str(tensor_info_list[x].rank_id)
         root_graph_id = str(tensor_info_list[x].root_graph_id)
         is_output = str(tensor_info_list[x].is_output)
+        if rank_id not in ranks_run_history:
+            ranks_run_history[rank_id] = {}
+        # Always index through ranks_run_history so the right rank's history is updated.
+        graphs_run_history = ranks_run_history[rank_id]
+        if root_graph_id not in graphs_run_history:
+            graphs_run_history[root_graph_id] = [iteration]
+        elif iteration not in graphs_run_history[root_graph_id]:
+            bisect.insort(graphs_run_history[root_graph_id], iteration)
+
         path = os.path.join(temp_dir, "rank_" + rank_id, net_name, root_graph_id, iteration)
         os.makedirs(path, exist_ok=True)
         if is_output == "True":
@@ -40,4 +53,16 @@
                                 ".DefaultFormat.npy", dir=path)
         full_path = file[1]
         np.save(full_path, tensor_list[x])
+    build_global_execution_order(temp_dir, ranks_run_history)
     return temp_dir
+
+
+def build_global_execution_order(path, ranks_run_history):
+    for rank_id in ranks_run_history.keys():
+        exec_order_path = os.path.join(path, "rank_" + rank_id, "execution_order")
+        os.makedirs(exec_order_path, exist_ok=True)
+        for graph in ranks_run_history[rank_id].keys():
+            full_path = os.path.join(exec_order_path, "ms_global_execution_order_graph_" + graph + ".csv")
+            with open(full_path, 'w+', newline='') as csv_file:
+                writer = csv.writer(csv_file)
+                # Wrap each iteration in its own row; writerows() on bare strings
+                # would split a multi-digit iteration into separate columns.
+                writer.writerows([[iteration] for iteration in ranks_run_history[rank_id][graph]])
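
Note for readers of this patch: the snippet below is a minimal Python sketch, not part of the commit, illustrating the file format the patch introduces. E2eDump::DumpRunIter (and CPUE2eDump::DumpRunIter) append one global dump iteration per row to rank_<rank_id>/execution_order/ms_global_execution_order_graph_<graph_id>.csv, and DebugServices::GetPrevIteration finds a tensor's previous iteration by taking the row just before the current one in that history. The helper names (read_graph_run_history, get_prev_iteration) are illustrative only and do not exist in the codebase.

import csv
import os


def read_graph_run_history(dump_dir, rank_id, graph_id):
    """Return the list of global dump iterations in which the graph ran, or None.

    None mirrors the C++ side's HISTORY_NOT_FOUND case, where prev-tensor
    comparisons are skipped because no history file exists.
    """
    history_file = os.path.join(dump_dir, "rank_" + str(rank_id), "execution_order",
                                "ms_global_execution_order_graph_" + str(graph_id) + ".csv")
    if not os.path.isfile(history_file):
        return None
    with open(history_file, newline='') as f:
        return [int(row[0]) for row in csv.reader(f) if row]


def get_prev_iteration(dump_dir, rank_id, graph_id, iteration):
    """Python analogue of DebugServices::GetPrevIteration.

    Returns the iteration in which the graph ran immediately before
    `iteration`, or None when the history is missing, the graph did not
    run in `iteration`, or `iteration` was the graph's first run
    (the C++ code returns UINT32_MAX in those cases).
    """
    history = read_graph_run_history(dump_dir, rank_id, graph_id)
    if not history or iteration not in history:
        return None
    idx = history.index(iteration)
    return history[idx - 1] if idx > 0 else None

For the multi-root-graph test above, graph 1 runs in iterations 1, 3 and 5, so get_prev_iteration(dump_path, 0, 1, 3) would return 1, while get_prev_iteration(dump_path, 0, 1, 1) would return None because iteration 1 is that graph's first run.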