diff --git a/config/data_dump.json b/config/data_dump.json
index 7eee5233e17..0ee8570ce09 100644
--- a/config/data_dump.json
+++ b/config/data_dump.json
@@ -3,17 +3,10 @@
         "dump_mode": 0,
         "path": "/test",
         "net_name": "ResNet50",
-        "iteration": 0,
+        "iteration": "0",
         "input_output": 2,
         "kernels": ["Default/Conv-op12"],
-        "support_device": [0,1,2,3,4,5,6,7]
-    },
-    "e2e_dump_settings": {
-        "enable": false,
-        "trans_flag": false
-    },
-    "async_dump_settings": {
-        "enable": false,
+        "support_device": [0,1,2,3,4,5,6,7],
         "op_debug_mode": 0
     }
 }
\ No newline at end of file
diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index a8a49aeb8c7..9fd695aa260 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -1198,6 +1198,7 @@ void AscendSession::Execute(const std::shared_ptr &kernel_graph, bo
   }
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
+  DumpSetup(kernel_graph);
   bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink);
   Dump(kernel_graph);
   if (!ret_ok) {
@@ -1209,6 +1210,13 @@ void AscendSession::Execute(const std::shared_ptr &kernel_graph, bo
   MS_LOG(INFO) << "Finish!";
 }
 
+void AscendSession::DumpSetup(const std::shared_ptr &kernel_graph) const {  // called by Execute() just before Run()
+  MS_LOG(INFO) << "Start!";
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  E2eDump::DumpSetup(kernel_graph.get(), device_id_);  // forwards the raw graph pointer and this session's device id
+  MS_LOG(INFO) << "Finish!";
+}
+
 void AscendSession::Dump(const std::shared_ptr &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
diff --git a/mindspore/ccsrc/backend/session/ascend_session.h b/mindspore/ccsrc/backend/session/ascend_session.h
index 8ba3d4cbc1c..642d8f5d7fe 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.h
+++ b/mindspore/ccsrc/backend/session/ascend_session.h
@@ -89,6 +89,7 @@ class AscendSession : public
SessionBasic { void Load(const std::shared_ptr &kernel_graph) const; void Execute(const std::shared_ptr &kernel_graph, bool is_task) const; void Dump(const std::shared_ptr &kernel_graph) const; + void DumpSetup(const std::shared_ptr &kernel_graph) const; void DumpAllGraphs(const std::vector &all_graphs); void LoadTensor(const std::shared_ptr &kernel_graph) const; // below functions are used for run op diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc index b2649651648..de6c083323f 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc +++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc @@ -107,9 +107,8 @@ void DumpJsonParser::Parse() { std::string cfg = ss.str(); MS_LOG(INFO) << "Dump json:" << cfg; - ParseCommonDumpSetting(j); - ParseAsyncDumpSetting(j); ParseE2eDumpSetting(j); + ParseCommonDumpSetting(j); JudgeDumpEnabled(); } @@ -214,6 +213,14 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) { auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput); auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels); auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice); + auto op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode); + + // async_dump is enabled by default, if e2e dump is enabled it will override this + auto context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context); + if (context->get_param(MS_CTX_DEVICE_TARGET) == kAscendDevice) { + async_dump_enabled_ = true; + } ParseDumpMode(*dump_mode); ParseDumpPath(*path); @@ -222,34 +229,29 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) { ParseInputOutput(*input_output); ParseKernels(*kernels); ParseSupportDevice(*support_device); -} - -void DumpJsonParser::ParseAsyncDumpSetting(const nlohmann::json &content) { - // async dump setting is optional - auto async_dump_setting = 
content.find(kAsyncDumpSettings);
-  if (async_dump_setting == content.end()) {
-    MS_LOG(INFO) << "No async_dump_settings";
-    return;
-  }
-
-  auto async_dump_enable = CheckJsonKeyExist(*async_dump_setting, kEnable);
-  auto op_debug_mode = CheckJsonKeyExist(*async_dump_setting, kOpDebugMode);
-
-  async_dump_enabled_ = ParseEnable(*async_dump_enable);
   ParseOpDebugMode(*op_debug_mode);
 }
 
 void DumpJsonParser::ParseE2eDumpSetting(const nlohmann::json &content) {
   auto e2e_dump_setting = content.find(kE2eDumpSettings);
+  auto context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context);
   if (e2e_dump_setting == content.end()) {
-    MS_LOG(INFO) << "No e2e_dump_settings";
-    return;
+    if (context->get_param(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
+      MS_LOG(EXCEPTION) << "e2e_dump_settings needed for GPU dump";
+    } else {
+      MS_LOG(INFO) << "No e2e_dump_settings";
+      return;
+    }
   }
 
   auto e2e_dump_enable = CheckJsonKeyExist(*e2e_dump_setting, kEnable);
   auto trans_flag = CheckJsonKeyExist(*e2e_dump_setting, kTransFlag);
 
   e2e_dump_enabled_ = ParseEnable(*e2e_dump_enable);
+  if (e2e_dump_enabled_ && context->get_param(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
+    MS_LOG(WARNING) << "Deprecated: Synchronous dump mode is deprecated and will be removed in a future release";
+  }
   trans_flag_ = ParseEnable(*trans_flag);
 }
 
@@ -304,8 +306,66 @@ void DumpJsonParser::ParseNetName(const nlohmann::json &content) {
 }
 
 void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
-  CheckJsonUnsignedType(content, kIteration);
-  iteration_ = content;
+  CheckJsonStringType(content, kIteration);
+  if (e2e_dump_enabled_) {
+    std::string temp_iter = content;
+    // is this a single iteration
+    if (temp_iter != "all" && temp_iter.find("-") == std::string::npos && temp_iter.find("|") == std::string::npos) {
+      iteration_ = std::stoul(temp_iter);
+    } else {
+      MS_LOG(EXCEPTION) << "Can only use a single value for the iteration in sync mode.";
+    }
+  } else if (async_dump_enabled_) {
+    async_iteration_ = content;
+    if (async_iteration_.empty()) {
+      MS_LOG(EXCEPTION) << "In async dump settings json file, iteration is empty";
+    }
+  } else {
+    MS_LOG(EXCEPTION) << "Dump Json Parse Failed. Async or E2E should be enabled. ";
+  }
+}
+
+bool DumpJsonParser::IsDumpIter(uint32_t iteration) {
+  // Checks whether `iteration` is selected by the async "iteration" setting, which is either "all" or a
+  // '|'-separated list of single values ("3") and inclusive ranges ("5-8"), e.g. "0|5-8|100".
+  // A segment that is not a valid number makes std::stoul throw (std::invalid_argument / std::out_of_range).
+  if (async_iteration_.empty()) {
+    // no async iteration was parsed, so nothing can match (std::stoul("") would throw instead)
+    return false;
+  }
+  if (async_iteration_ == "all") {
+    return true;
+  }
+  size_t start = 0;
+  while (true) {
+    // for the last segment find() yields npos, and npos as count makes substr() take the rest of the string
+    const size_t end = async_iteration_.find('|', start);
+    const std::string segment =
+      async_iteration_.substr(start, end == std::string::npos ? std::string::npos : end - start);
+    const size_t range_idx = segment.find('-');
+    if (range_idx != std::string::npos) {
+      uint32_t low_range = std::stoul(segment.substr(0, range_idx));
+      uint32_t high_range = std::stoul(segment.substr(range_idx + 1));
+      if ((low_range <= iteration) && (iteration <= high_range)) {
+        return true;
+      }
+    } else if (iteration == std::stoul(segment)) {
+      return true;
+    }
+    if (end == std::string::npos) {
+      return false;
+    }
+    start = end + 1;
+  }
+}
+
+bool DumpJsonParser::IsSingleIter() {
+  // bool DumpJsonParser::IsSingleIter() --> checks if iteration in json dump file is single or not.
+  if (async_iteration_ != "all" && async_iteration_.find("-") == std::string::npos &&
+      async_iteration_.find("|") == std::string::npos) {
+    return true;
+  }
+  return false;
 }
 
 void DumpJsonParser::ParseInputOutput(const nlohmann::json &content) {
diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.h b/mindspore/ccsrc/debug/data_dump/dump_json_parser.h
index f296697f6ed..bba500f497d 100644
--- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.h
+++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.h
@@ -41,6 +41,9 @@ class DumpJsonParser {
   bool NeedDump(const std::string &op_full_name) const;
   void MatchKernel(const std::string &kernel_name);
   void PrintUnusedKernel();
+  bool IsDumpIter(uint32_t iteration);
+  bool DumpAllIter();
+  bool IsSingleIter();
 
   bool async_dump_enabled() const { return async_dump_enabled_; }
   bool e2e_dump_enabled() const { return e2e_dump_enabled_; }
@@ -58,6 +61,7 @@ class DumpJsonParser {
   bool OutputNeedDump() const;
   std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
   void UpdateNeedDumpKernels(NotNull kernel_graph);
+  bool AsyncDumpEnabled() const { return async_dump_enabled_; }
 
  private:
   DumpJsonParser() = default;
@@ -71,6 +75,7 @@ class DumpJsonParser {
   std::string path_;
   std::string net_name_;
   uint32_t iteration_{0};
+  std::string async_iteration_;
   uint32_t input_output_{0};
   std::map kernels_;
   std::set support_devices_;
@@ -80,7 +85,6 @@ class DumpJsonParser {
   bool already_parsed_{false};
 
   void ParseCommonDumpSetting(const nlohmann::json &content);
-  void ParseAsyncDumpSetting(const nlohmann::json &content);
   void ParseE2eDumpSetting(const nlohmann::json &content);
 
   bool IsDumpEnabled();
diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
index c961c3d953f..e3be8c564b1 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
@@ -22,6 +22,7 @@
 
 #include "debug/data_dump/dump_json_parser.h"
 #include "common/trans.h"
+#include "debug/common.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "utils/ms_context.h"
 #include "runtime/device/kernel_runtime_manager.h"
@@ -235,6 +236,56 @@ void E2eDump::DumpParametersAndConst(const session::KernelGraph *graph, const st
   }
 }
 
+void E2eDump::DumpSetup(const session::KernelGraph *graph, uint32_t device_id) {
+  // For an async-dump iteration, links <path>/rank_<device_id>/<net_name>/<graph_id>/<cur_iter> to the
+  // active dump dir <path>/rank_<device_id>/_/<graph_id>/0. Does nothing unless async dump is enabled and
+  // the current iteration is selected for dumping. Failures are reported through MS_LOG(EXCEPTION).
+  MS_EXCEPTION_IF_NULL(graph);
+  auto &dump_json_parser = DumpJsonParser::GetInstance();
+  uint32_t cur_iter = dump_json_parser.cur_dump_iter();
+  if (dump_json_parser.AsyncDumpEnabled() && dump_json_parser.IsDumpIter(cur_iter)) {
+    auto zero_dir_dump_path =
+      dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0";
+
+    auto root_cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" +
+                                   dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id());
+
+    auto cur_iter_dump_path = root_cur_iter_dump_path + "/" + std::to_string(cur_iter);
+
+    MS_LOG(INFO) << "zero_dir_dump_path: " << zero_dir_dump_path;
+    MS_LOG(INFO) << "root_cur_iter_dump_path: " << root_cur_iter_dump_path;
+    MS_LOG(INFO) << "cur_iter_dump_path: " << cur_iter_dump_path;
+
+    // create the parent directory that will hold the per-iteration symlink
+    bool status = Common::CreateNotExistDirs(root_cur_iter_dump_path);
+    if (!status) {
+      MS_LOG(EXCEPTION) << "Failed at CreateNotExistDirs for " << root_cur_iter_dump_path;
+    }
+
+    // the paths are built from dump-config strings, so single-quote them for the shell (' becomes '\'')
+    auto shell_quote = [](const std::string &path) {
+      std::string quoted = "'";
+      for (char c : path) {
+        if (c == '\'') {
+          quoted += "'\\''";
+        } else {
+          quoted += c;
+        }
+      }
+      quoted += "'";
+      return quoted;
+    };
+
+    // create symlink to active dump dir for the iteration in final dump dir; -n replaces an existing
+    // symlink at that path instead of creating the new link inside the directory it points to
+    std::string command = "ln -sfn " + shell_quote(zero_dir_dump_path) + " " + shell_quote(cur_iter_dump_path);
+    MS_LOG(INFO) << "ln command: " << command;
+    if (system(command.c_str())) {
+      MS_LOG(EXCEPTION) << "failed to create symlink to active dump dir for the iteration in final dump dir.";
+    }
+  }
+}
+
 bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
   auto &dump_json_parser = DumpJsonParser::GetInstance();
@@ -245,16 +296,60 @@ bool
E2eDump::DumpData(const session::KernelGraph *graph, uint32_t device_id, co if (starting_graph_id == graph_id) { dump_json_parser.UpdateDumpIter(); } - if (!dump_json_parser.GetIterDumpFlag()) { + + if (dump_json_parser.GetIterDumpFlag()) { + MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter(); + MS_LOG(INFO) << "Current graph id is " << graph_id; + std::string dump_path = GenerateDumpPath(graph_id, &device_id); + + DumpInput(graph, dump_path, debugger); + DumpOutput(graph, dump_path, debugger); + DumpParametersAndConst(graph, dump_path, debugger); + return true; + } else if (dump_json_parser.AsyncDumpEnabled()) { + uint32_t prev_dump_iter = dump_json_parser.cur_dump_iter() - 1; + + auto zero_dir_dump_path = + dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/_/" + std::to_string(graph->graph_id()) + "/0"; + + auto cur_iter_dump_path = dump_json_parser.path() + "/rank_" + std::to_string(device_id) + "/" + + dump_json_parser.net_name() + "/" + std::to_string(graph->graph_id()) + "/" + + std::to_string(prev_dump_iter); + + MS_LOG(INFO) << "zero_dir_dump_path: " << zero_dir_dump_path; + MS_LOG(INFO) << "cur_iter_dump_path: " << cur_iter_dump_path; + + if (dump_json_parser.IsDumpIter(prev_dump_iter)) { + // remove symlink to active dump dir + std::string command = "rm -f " + cur_iter_dump_path; + MS_LOG(INFO) << "rm command: " << command; + if (system(command.c_str())) { + MS_LOG(EXCEPTION) << "failed to remove symlink to active dump dir."; + } + + // create actual dir for iteration in final dump dir + bool status = Common::CreateNotExistDirs(cur_iter_dump_path); + if (!status) { + MS_LOG(EXCEPTION) << "failed at CreateNotExistDirs for " << cur_iter_dump_path; + } + + // move contents from active dump dir to final dump dir + command = "mv " + zero_dir_dump_path + "/* " + cur_iter_dump_path + "/."; + MS_LOG(INFO) << "mv command: " << command; + if (system(command.c_str())) { + MS_LOG(EXCEPTION) << "Ascend runtime 
has changed the dump dir structure!!!"; + } + } else { + // delete contents from active dump dir + std::string command = "rm -f " + zero_dir_dump_path + "/*"; + MS_LOG(INFO) << "rm command: " << command; + if (system(command.c_str())) { + MS_LOG(EXCEPTION) << "Ascend runtime has changed the dump dir structure!!!"; + } + } + return true; } - MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter(); - MS_LOG(INFO) << "Current graph id is " << graph_id; - std::string dump_path = GenerateDumpPath(graph_id, &device_id); - - DumpInput(graph, dump_path, debugger); - DumpOutput(graph, dump_path, debugger); - DumpParametersAndConst(graph, dump_path, debugger); - return true; + return false; } } // namespace mindspore diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h index d1269c0c29f..74ed788af1c 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h @@ -17,6 +17,7 @@ #ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_ #define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_ +#include #include #include @@ -33,6 +34,7 @@ class E2eDump { public: E2eDump() = default; ~E2eDump() = default; + static void DumpSetup(const session::KernelGraph *graph, uint32_t device_id); static bool DumpData(const session::KernelGraph *graph, uint32_t device_id, const Debugger *debugger = nullptr); // Dump data when task error. 
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, diff --git a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc index cb8f453d3b0..f4404cd7d60 100644 --- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc +++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc @@ -141,8 +141,8 @@ void DataDumper::SetOpMappingInfo(NotNull dump_inf auto device_id = context_ptr->get_param(MS_CTX_DEVICE_ID); dump_info->set_dump_path("/" + dump_path + "/rank_" + std::to_string(device_id) + "/"); MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path; - dump_info->set_model_name(DumpJsonParser::GetInstance().net_name()); - dump_info->set_dump_step(std::to_string(DumpJsonParser::GetInstance().iteration())); + dump_info->set_model_name("_"); + dump_info->set_dump_step("0"); dump_info->set_model_id(graph_id); dump_info->set_flag(kAicpuLoadFlag); diff --git a/tests/st/dump/async_dump.json b/tests/st/dump/async_dump.json index 3dd418f1eaf..e629ef2564a 100644 --- a/tests/st/dump/async_dump.json +++ b/tests/st/dump/async_dump.json @@ -3,13 +3,10 @@ "dump_mode": 0, "path": "/test", "net_name": "Net", - "iteration": 0, + "iteration": "0", "input_output": 2, "kernels": ["Default/TensorAdd-op3"], - "support_device": [0,1,2,3,4,5,6,7] - }, - "async_dump_settings": { - "enable": true, + "support_device": [0,1,2,3,4,5,6,7], "op_debug_mode": 0 } } \ No newline at end of file diff --git a/tests/st/dump/e2e_dump.json b/tests/st/dump/e2e_dump.json index 39b8277d776..73a0b6c96de 100644 --- a/tests/st/dump/e2e_dump.json +++ b/tests/st/dump/e2e_dump.json @@ -3,10 +3,11 @@ "dump_mode": 0, "path": "/test", "net_name": "Net", - "iteration": 0, + "iteration": "0", "input_output": 0, "kernels": ["Default/Conv-op12"], - "support_device": [0,1,2,3,4,5,6,7] + "support_device": [0,1,2,3,4,5,6,7], + "op_debug_mode": 0 }, "e2e_dump_settings": { "enable": true, diff --git 
a/tests/st/dump/test_async_dump_net_multi_layer_mode1.json b/tests/st/dump/test_async_dump_net_multi_layer_mode1.json index 41b392adec9..6ce51fc1a48 100644 --- a/tests/st/dump/test_async_dump_net_multi_layer_mode1.json +++ b/tests/st/dump/test_async_dump_net_multi_layer_mode1.json @@ -3,7 +3,7 @@ "dump_mode": 0, "path": "/tmp/async_dump/test_async_dump_net_multi_layer_mode1", "net_name": "test", - "iteration": 0, + "iteration": "0", "input_output": 2, "kernels": [ "default/TensorAdd-op10", @@ -12,14 +12,7 @@ "Default/optimizer-Momentum/tuple_getitem-op29", "Default/optimizer-Momentum/ApplyMomentum-op12" ], - "support_device": [0,1,2,3,4,5,6,7] - }, - "async_dump_settings": { - "enable": true, + "support_device": [0,1,2,3,4,5,6,7], "op_debug_mode": 0 - }, - "e2e_dump_settings": { - "enable": false, - "trans_flag": false } } \ No newline at end of file