From 1480c93d044a197a64a6eac6b812dd0685fcf370 Mon Sep 17 00:00:00 2001 From: caifubi Date: Mon, 14 Sep 2020 18:47:30 +0800 Subject: [PATCH] Data Dump Bug Fix 1. Remove old e2e dump json 2. Remove warning log 3. Add device id to e2e dump path 4. Fix bug of dump json parse failed after hccl_init --- config/data_dump.json | 2 +- config/e2e_dump_config.json | 22 ------------------- config/e2e_dump_config_0.json | 22 ------------------- config/e2e_dump_config_1.json | 22 ------------------- .../ccsrc/backend/session/ascend_session.cc | 1 + .../ccsrc/debug/data_dump/dump_json_parser.cc | 22 ++++++++----------- .../ccsrc/debug/data_dump/dump_json_parser.h | 1 + .../ccsrc/debug/data_dump/e2e_dump_util.cc | 11 ++++++---- .../device/ascend/ascend_kernel_runtime.cc | 5 +---- 9 files changed, 20 insertions(+), 88 deletions(-) delete mode 100644 config/e2e_dump_config.json delete mode 100644 config/e2e_dump_config_0.json delete mode 100644 config/e2e_dump_config_1.json diff --git a/config/data_dump.json b/config/data_dump.json index 3df50432fc3..7eee5233e17 100644 --- a/config/data_dump.json +++ b/config/data_dump.json @@ -4,7 +4,7 @@ "path": "/test", "net_name": "ResNet50", "iteration": 0, - "input_output": 0, + "input_output": 2, "kernels": ["Default/Conv-op12"], "support_device": [0,1,2,3,4,5,6,7] }, diff --git a/config/e2e_dump_config.json b/config/e2e_dump_config.json deleted file mode 100644 index fdba941f971..00000000000 --- a/config/e2e_dump_config.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "DumpSettings": { - "enable": false, - "trans_flag": false, - "path": "/tmp/net/", - "net_name": "ResNet50", - "mode": 0, - "iteration": 0, - "kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"] - }, - - "DumpSettingsSpec": { - "enable": "true: dump enable, false: dump disable", - "trans_flag": "true: trans to host format, false: not trans format", - "path": "the dump file folder", - "net_name": "net name eg:ResNet50", - "mode": "0: dump all kernels, 1: dump kernels in kernels list", - "iteration": "0: all iteration, others: specified iteration ", - "kernels": "op's full scope name which need to be dump" - }, - "other": {} -} \ No newline at end of file diff --git a/config/e2e_dump_config_0.json b/config/e2e_dump_config_0.json deleted file mode 100644 index 64b18b8b553..00000000000 --- a/config/e2e_dump_config_0.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "DumpSettings": { - "enable": false, - "trans_flag": false, - "path": "/tmp/hccllog/0", - "net_name": "ResNet50", - "mode": 0, - "iteration": 0, - "kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"] - }, - - "DumpSettingsSpec": { - "enable": "true: dump enable, false: dump disable", - "trans_flag": "true: trans to host format, false: not trans format", - "path": "the dump file folder", - "net_name": "net name eg:ResNet50", - "mode": "0: dump all kernels, 1: dump kernels in kernels list", - "iteration": "0: all iteration, others: specified iteration ", - "kernels": "op's full scope name which need to be dump" - }, - "other": {} -} \ No newline at end of file diff --git a/config/e2e_dump_config_1.json b/config/e2e_dump_config_1.json deleted file mode 100644 index 14864877996..00000000000 --- a/config/e2e_dump_config_1.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "DumpSettings": { - "enable": false, - "trans_flag": false, - "path": "/tmp/hccllog/1", - "net_name": "ResNet50", - "mode": 0, - "iteration": 0, - "kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"] - }, - - "DumpSettingsSpec": { - "enable": "true: dump enable, false: dump disable", - "trans_flag": "true: trans to host format, false: not trans format", - "path": "the dump file folder", - "net_name": "net name eg:ResNet50", - "mode": "0: dump all kernels, 1: dump kernels in kernels list", - "iteration": "0: all iteration, others: specified iteration ", - "kernels": "op's full scope name which need to be dump" - }, - "other": {} -} \ No newline at end of file diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index 6fd83824a67..ede3390455e 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -442,6 +442,7 @@ void AscendSession::InitRuntimeResource() { if (!runtime_instance->Init()) { MS_LOG(EXCEPTION) << "Kernel runtime init error."; } + DumpJsonParser::GetInstance().Parse(); MS_LOG(INFO) << "Finish!"; } diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc index a6658f234e9..ff9c63941dd 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc +++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc @@ -74,6 +74,10 @@ bool DumpJsonParser::IsDumpEnabled() { void DumpJsonParser::Parse() { std::lock_guard guard(lock_); + if (already_parsed_) { + return; + } + already_parsed_ = true; if (!IsDumpEnabled()) { return; } @@ -305,6 +309,8 @@ void DumpJsonParser::JudgeDumpEnabled() { MS_LOG(WARNING) << "Dump not enabled. device_id:" << device_id << " not support"; } context->set_param(MS_CTX_ENABLE_MEM_REUSE, !e2e_dump_enabled_); + MS_LOG(INFO) << "Dump status, e2e_dump_enabled:" << e2e_dump_enabled_ + << " async_dump_enabled:" << async_dump_enabled_; } bool DumpJsonParser::NeedDump(const std::string &op_full_name) const { @@ -325,6 +331,9 @@ void DumpJsonParser::MatchKernel(const std::string &kernel_name) { } void DumpJsonParser::PrintUnusedKernel() { + if (!e2e_dump_enabled_ && !async_dump_enabled_) { + return; + } for (const auto &iter : kernels_) { if (iter.second == 0) { MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first; @@ -362,16 +371,6 @@ bool DumpJsonParser::OutputNeedDump() const { return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly; } -bool NeedAsyncDump(const CNodePtr &kernel) { - if (AnfAlgo::GetKernelType(kernel) != TBE_KERNEL && AnfAlgo::GetKernelType(kernel) != AICPU_KERNEL && - AnfAlgo::GetKernelType(kernel) != AKG_KERNEL) { - return false; - } - MS_EXCEPTION_IF_NULL(kernel); - // dump all kernel if mode is set 0 in data_dump.json - return DumpJsonParser::GetInstance().NeedDump(kernel->fullname_with_scope()); -} - void DumpJsonParser::UpdateNeedDumpKernels(NotNull kernel_graph) { if (e2e_dump_enabled_) { MS_LOG(INFO) << "E2e dump no need to update dump kernel list"; @@ -391,9 +390,6 @@ void DumpJsonParser::UpdateNeedDumpKernels(NotNull update_kernels.try_emplace(input->fullname_with_scope(), 0); } } - } else if (NeedAsyncDump(kernel)) { - MS_LOG(INFO) << "[AsyncDump] Match Node:" << kernel->fullname_with_scope(); - update_kernels.try_emplace(kernel->fullname_with_scope(), 0); } } kernels_.insert(update_kernels.begin(), update_kernels.end()); diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.h b/mindspore/ccsrc/debug/data_dump/dump_json_parser.h index b0576abb8df..8dcfbd57232 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.h +++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.h @@ -72,6 +72,7 @@ class DumpJsonParser { uint32_t op_debug_mode_{0}; bool trans_flag_{false}; uint32_t cur_dump_iter_{0}; + bool already_parsed_{false}; void ParseCommonDumpSetting(const nlohmann::json &content); void ParseAsyncDumpSetting(const nlohmann::json &content); diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump_util.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump_util.cc index b250baa6e44..4bde9dab4e9 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump_util.cc +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump_util.cc @@ -206,14 +206,17 @@ bool E2eDumpUtil::DumpData(const session::KernelGraph *graph, Debugger *debugger } } MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter(); + auto context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context); + auto device_id = context->get_param(MS_CTX_DEVICE_ID); + std::string net_name = dump_json_parser.net_name(); std::string iterator = std::to_string(dump_json_parser.cur_dump_iter()); std::string dump_path = dump_json_parser.path(); - if (dump_path.back() == '/') { - dump_path = dump_path + net_name + '/' + iterator; - } else { - dump_path = dump_path + '/' + net_name + '/' + iterator; + if (dump_path.back() != '/') { + dump_path += "/"; } + dump_path += (net_name + "/device_" + std::to_string(device_id) + "/iteration_" + iterator); DumpInput(graph, dump_path, debugger); DumpOutput(graph, dump_path, debugger); DumpParameters(graph, dump_path, debugger); diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 1a2ea749dbe..5980d3ee6c9 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -206,11 +206,8 @@ bool AscendKernelRuntime::Init() { SetContext(); return true; } - bool ret = false; - - DumpJsonParser::GetInstance().Parse(); // Start up profiling before rtSetDevice - ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); + bool ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); if (!ret) { MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed."; }