From 67bfdcab093c27445b0171e63cb891ae37677e0b Mon Sep 17 00:00:00 2001 From: maoyaomin Date: Sat, 4 Mar 2023 17:46:28 +0800 Subject: [PATCH] add pynative mode operator overflow check for dump --- .../ccsrc/debug/data_dump/dump_json_parser.cc | 3 -- mindspore/ccsrc/debug/data_dump/e2e_dump.cc | 8 +--- mindspore/ccsrc/debug/debugger/debugger.cc | 7 +++- .../ccsrc/debug/debugger/debugger_utils.cc | 2 +- .../ascend/hal/device/dump/kernel_dumper.cc | 39 ++++++++++++------- .../ascend/hal/device/dump/kernel_dumper.h | 3 +- .../hal/hardware/ascend_kernel_executor.cc | 15 ++++++- .../graph_scheduler/actor/debug_actor.cc | 31 ++------------- .../graph_scheduler/actor/debug_actor.h | 3 +- tests/st/dump/test_multi_root_graph_dump.py | 6 +-- 10 files changed, 55 insertions(+), 62 deletions(-) diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc index b67de7d2447..99c5388a6b4 100644 --- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc +++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc @@ -88,9 +88,6 @@ bool DumpJsonParser::IsDumpEnabled() { auto context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context); - if (context->get_param(MS_CTX_EXECUTION_MODE) == kPynativeMode) { - MS_LOG(EXCEPTION) << "Dump is disabled in PyNative mode. Please set mode to GRAPH_MODE in context."; - } return true; } diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc index 8dcb6af2701..050bdea5f23 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc @@ -555,12 +555,8 @@ void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) { return; } std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/"; - std::string graph_str; - if (Debugger::GetInstance()->GetAscendKernelByKernelFlag()) { - graph_str = std::to_string(graph->graph_id()); - } else { - graph_str = IsDeviceTargetGPU() ? std::to_string(graph->graph_id()) : std::to_string(graph->root_graph_id()); - } + std::string graph_str = + IsDeviceTargetGPU() ? std::to_string(graph->graph_id()) : std::to_string(graph->root_graph_id()); std::string file_name_to_check = execution_order_path + "/ms_global_execution_order_graph_" + graph_str + ".csv"; auto real_path = Common::CreatePrefixPath(file_name_to_check); if (!real_path.has_value()) { diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 8e920826e43..df91acf1ac0 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -513,7 +513,12 @@ void Debugger::DumpParamsAndConstAndHistory() { for (auto kernel_graph = executed_graph_ptr_set_.cbegin(); kernel_graph != executed_graph_ptr_set_.cend(); ++kernel_graph) { // Dump graph run hisotry for each graph. - E2eDump::DumpRunIter(*kernel_graph, GetRankID()); + if (Debugger::GetInstance()->GetAscendKernelByKernelFlag() && + (*kernel_graph)->graph_id() != (*kernel_graph)->root_graph_id()) { + MS_LOG(INFO) << "current graph graph_id = " << (*kernel_graph)->graph_id() << " is not root graph."; + } else { + E2eDump::DumpRunIter(*kernel_graph, GetRankID()); + } } if (!cur_root_graph_checked) { visited_root_graph_ids_.push_back(cur_root_graph_id_); diff --git a/mindspore/ccsrc/debug/debugger/debugger_utils.cc b/mindspore/ccsrc/debug/debugger/debugger_utils.cc index d42923d68bc..8273dff40a4 100644 --- a/mindspore/ccsrc/debug/debugger/debugger_utils.cc +++ b/mindspore/ccsrc/debug/debugger/debugger_utils.cc @@ -215,7 +215,7 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, MS_EXCEPTION_IF_NULL(kernel_graph); auto graph_id = kernel_graph->graph_id(); // for GPU, nodes are dumped in graph_id directory. - if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) { + if (IsDeviceTargetGPU()) { debugger->DumpSingleNode(cnode, graph_id); } else { // for Ascend, node are dumped in root_graph_id directory. diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.cc index 84fd30e913f..e263e0e5215 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.cc +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.cc @@ -16,6 +16,7 @@ #include "plugin/device/ascend/hal/device/dump/kernel_dumper.h" #include +#include #ifndef ENABLE_SECURITY #include "debug/data_dump/dump_json_parser.h" #endif @@ -44,7 +45,7 @@ static constexpr uint64_t kOpDebugMemorySize = 2048; const size_t kDebugP2pSize = 8UL; } // namespace DUMPER_REG(kAscendDevice, KernelDumper); -std::mutex KernelDumper::debug_register_mutex_; +std::mutex KernelDumper::dumper_mutex_; std::map> KernelDumper::op_debug_tasks; std::map KernelDumper::is_data_map; std::map KernelDumper::stream_task_graphs; @@ -80,9 +81,17 @@ KernelDumper::~KernelDumper() { } void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) { - std::lock_guard lock(debug_register_mutex_); - aicpu::dump::OpMappingInfo dump_info; - SetOpMappingInfo(NOT_NULL(&dump_info), kernel); + auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel)); + if (stream == nullptr) { + stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex); + } + if (DumpJsonParser::GetInstance().op_debug_mode() > 0) { + auto rt_ret = rtStreamSynchronize(stream); + dumper_mutex_.unlock(); + if (rt_ret != ACL_ERROR_RT_AICORE_OVER_FLOW) { + return; + } + } if (!KernelNeedDump(kernel)) { return; @@ -91,10 +100,9 @@ void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) { MS_LOG(WARNING) << "[KernelDumper] kernel [" << kernel->UniqueName() << "] is a non-task node, skip dump."; return; } - auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel)); - if (stream == nullptr) { - stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex); - } + aicpu::dump::OpMappingInfo dump_info; + SetOpMappingInfo(NOT_NULL(&dump_info), kernel); + DumpJsonParser::GetInstance().MatchKernel(kernel->fullname_with_scope()); aicpu::dump::Task task; ConstructDumpTask(NOT_NULL(kernel), NOT_NULL(&task)); @@ -105,7 +113,7 @@ void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) { graph_id_ = AnfAlgo::GetGraphId(kernel.get()); std::string stream_task_id = std::to_string(stream_id_) + std::to_string(task_id_); KernelDumper::stream_task_graphs.emplace(stream_task_id, kernel->fullname_with_scope()); - MS_LOG(INFO) << "[DataDump] Get runtime info graph_id:" << graph_id_ << " stream_id:" << stream_id_ + MS_LOG(INFO) << "[KernelDumper] Get runtime info graph_id:" << graph_id_ << " stream_id:" << stream_id_ << " task_id:" << task_id_ << " fullname:" << kernel->fullname_with_scope(); } @@ -114,12 +122,12 @@ void KernelDumper::SetOpMappingInfo(NotNull dump_i dump_info->set_dump_path(dump_path_); dump_info->set_model_name(net_name_); dump_info->set_dump_step(iteration_); - auto graph_id = AnfAlgo::GetGraphId(kernel.get()); - dump_info->set_model_id(graph_id); - dump_info->set_flag(kAicpuLoadFlag); - FuncGraphPtr f_graph = kernel->func_graph(); auto kernel_graph_ = f_graph->cast(); + auto root_graph_id = kernel_graph_->root_graph_id(); + dump_info->set_model_id(root_graph_id); + dump_info->set_flag(kAicpuLoadFlag); + auto input_ctrl_tensors = kernel_graph_->device_loop_control_tensors(); if (input_ctrl_tensors.size() > 0) { auto kCurLoopCountName = "current_loop_count"; @@ -225,7 +233,6 @@ void KernelDumper::ExecutorDumpOp(const aicpu::dump::OpMappingInfo &op_mapping_i MS_LOG(ERROR) << "[KernelDumper] Call rt api rtCpuKernelLaunch Failed, rt_ret = " << rt_ret; return; } - rtStreamSynchronize(stream_); } void KernelDumper::ConstructDumpTask(NotNull kernel, NotNull dump_task) { @@ -375,7 +382,6 @@ void KernelDumper::MallocP2PDebugMem(const void *const op_debug_addr) { } void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) { - std::lock_guard lock(register_mutex_); uint32_t op_debug_mode = DumpJsonParser::GetInstance().op_debug_mode(); auto iter = kOverflowModeStr.find(op_debug_mode); if (iter == kOverflowModeStr.end()) { @@ -384,6 +390,7 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) { if (op_debug_mode == kNoOverflow) { return; } + dumper_mutex_.lock(); auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel)); if (stream == nullptr) { stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex); @@ -391,6 +398,8 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) { if (KernelDumper::op_debug_tasks.find(stream) != KernelDumper::op_debug_tasks.end()) { return; } else { + std::string stream_id = std::to_string(AnfAlgo::GetStreamId(kernel)); + KernelDumper::stream_task_graphs.emplace(stream_id, "KernelDumper"); auto graph_id = AnfAlgo::GetGraphId(kernel.get()); if (KernelDumper::is_data_map.find(graph_id) != KernelDumper::is_data_map.end()) { return; diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.h b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.h index db504607c03..a64550951b0 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.h +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.h @@ -72,6 +72,7 @@ class KernelDumper : public debug::OverflowDumper { static std::map> op_debug_tasks; static std::map is_data_map; static std::map stream_task_graphs; + static std::mutex dumper_mutex_; string dump_path_; string net_name_; @@ -79,7 +80,6 @@ class KernelDumper : public debug::OverflowDumper { private: // Support multi-thread. - static std::mutex debug_register_mutex_; bool load_flag_; uint32_t graph_id_; uint32_t task_id_{0U}; @@ -91,7 +91,6 @@ class KernelDumper : public debug::OverflowDumper { void *dev_load_mem_ = nullptr; void *proto_dev_mem_ = nullptr; void *proto_size_dev_mem_ = nullptr; - std::mutex register_mutex_; std::string overflow_dump_filename = "debug_files"; void *p2p_debug_addr_ = nullptr; void SetOpMappingInfo(NotNull dump_info, const CNodePtr &kernel); diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc index 7954081c282..b8c3778d166 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc +++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc @@ -43,6 +43,7 @@ #include "plugin/device/ascend/hal/profiler/ascend_profiling.h" #include "plugin/device/ascend/hal/device/profiling/profiling_manager.h" #include "plugin/device/ascend/hal/device/dump/ascend_dump.h" +#include "debug/data_dump/overflow_dumper.h" using Adx::AdxRegDumpProcessCallBack; using mindspore::device::ascend::ProfilingManager; @@ -370,7 +371,13 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vectorInit(); + register_dumper->OpDebugRegisterForStream(kernel); + } +#endif bool is_dynamic_shape = common::AnfAlgo::IsDynamicShape(kernel); if (!is_dynamic_shape || !(common::AnfAlgo::GetBooleanAttr(kernel, kAttrMSFunction))) { auto iter = node_atomics_persistent_cache_.find(kernel); @@ -399,6 +406,12 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vectorOpLoadDumpInfo(kernel); + } +#endif #ifndef ENABLE_SECURITY auto ascend_instance = profiler::ascend::AscendProfiler::GetInstance(); MS_EXCEPTION_IF_NULL(ascend_instance); diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc index 54f29f81ed2..28bb55d3f28 100644 --- a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc +++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc @@ -24,7 +24,6 @@ #ifndef ENABLE_SECURITY #include "debug/data_dump/cpu_e2e_dump.h" #include "debug/data_dump/e2e_dump.h" -#include "debug/data_dump/overflow_dumper.h" #include "utils/ms_context.h" #endif #ifdef ENABLE_DEBUGGER @@ -80,18 +79,9 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in #endif } else if (device_context->GetDeviceType() == device::DeviceType::kAscend) { #ifdef ENABLE_DEBUGGER -#ifndef ENABLE_SECURITY - auto kernel_graph = std::dynamic_pointer_cast(cnode->func_graph()); - graph_id_sets_.insert(kernel_graph->graph_id()); - if (DumpJsonParser::GetInstance().async_dump_enabled()) { - auto kernel_dumper = debug::OverflowDumper::GetInstance(kAscendDevice); - kernel_dumper->Init(); - kernel_dumper->OpDebugRegisterForStream(cnode); - kernel_dumper->OpLoadDumpInfo(cnode); - } -#endif auto debugger = Debugger::GetInstance(); if (debugger != nullptr) { + auto kernel_graph = std::dynamic_pointer_cast(cnode->func_graph()); debugger->InsertExecutedGraph(kernel_graph); debugger->SetAscendKernelByKernelFlag(true); bool read_data = CheckReadData(cnode); @@ -186,7 +176,7 @@ void DebugActor::DebugOnStepBegin(const std::vector &graphs, return kernel->fullname_with_scope().find("InitDataSetQueue") != std::string::npos; }); } - if (!is_data_map_ && !graphs[0]->is_graph_run_mode()) { + if (!is_data_map_) { auto kCurLoopCountName = "current_loop_count"; for (size_t i = 0; i < graphs.size(); i++) { const auto &graph_ = graphs[i]; @@ -200,7 +190,7 @@ void DebugActor::DebugOnStepBegin(const std::vector &graphs, } auto tensor = device_loop_control_tensors.at(kCurLoopCountName); MS_EXCEPTION_IF_NULL(tensor); - auto *cur_val = static_cast(tensor->data_c()); + auto *cur_val = static_cast(tensor->data_c()); MS_EXCEPTION_IF_NULL(cur_val); *cur_val = current_step; tensor->set_sync_status(kNeedSyncHostToDevice); @@ -236,21 +226,6 @@ void DebugActor::DebugOnStepEnd(OpContext *const op_context, const } #endif -#ifdef ENABLE_DEBUGGER -#ifndef ENABLE_SECURITY - if (DumpJsonParser::GetInstance().async_dump_enabled() && DumpJsonParser::GetInstance().op_debug_mode() > 0 && - Debugger::GetInstance()->GetAscendKernelByKernelFlag()) { - uint32_t rank_id = Debugger::GetRankID(); - std::set::iterator graph_id_iter; - for (graph_id_iter = graph_id_sets_.begin(); graph_id_iter != graph_id_sets_.end(); ++graph_id_iter) { - auto graph_id = *graph_id_iter; - DeleteNoOverflowFile(rank_id, graph_id); - } - graph_id_sets_.clear(); - } -#endif -#endif - #ifdef ENABLE_DEBUGGER auto debugger = Debugger::GetInstance(); if (debugger != nullptr) { diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.h b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.h index 71fa4e09652..c9ed8836bbb 100644 --- a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.h +++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.h @@ -55,7 +55,7 @@ class DebugActor : public ActorBase { // The debug on step end. void DebugOnStepEnd(OpContext *const op_context, const AID *from_aid); - static inline uint32_t current_step{0}; + static inline uint64_t current_step{0}; private: // class members @@ -63,7 +63,6 @@ class DebugActor : public ActorBase { // Support multi-thread. std::mutex debug_mutex_; - std::set graph_id_sets_; }; } // namespace runtime diff --git a/tests/st/dump/test_multi_root_graph_dump.py b/tests/st/dump/test_multi_root_graph_dump.py index 81e77de6f60..805c627ce6a 100644 --- a/tests/st/dump/test_multi_root_graph_dump.py +++ b/tests/st/dump/test_multi_root_graph_dump.py @@ -101,8 +101,8 @@ def run_multi_root_graph_dump(device, dump_mode, test_name): time.sleep(2) execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order') # Multi root graph script: check dump data dir and graph history files and see if iteration number is matched. - if device == "GPU" or os.environ.get('GRAPH_OP_RUN') == "1": - # In GPU or KernelByKernel, we have 4 kernel graphs folders under rank_0 dir. + if device == "GPU": + # In GPU, we have 4 kernel graphs folders under rank_0 dir. # In graph history dir, there are 2 files for each graph (ms_execution_order and ms_global_execution_order). assert len(os.listdir(dump_file_path)) == 4 assert len(os.listdir(execution_order_path)) == 8 @@ -111,7 +111,7 @@ def run_multi_root_graph_dump(device, dump_mode, test_name): check_graph_structure(dump_file_path, execution_order_path, '2', ['1', '3']) check_graph_structure(dump_file_path, execution_order_path, '3', ['5']) else: - # In Ascend Super Kernel, we have 2 root graphs folders under rank_0 dir. + # In Ascend, we have 2 root graphs folders under rank_0 dir. # In graph history dir, there are 4 ms_execution_order files and 2 ms_global_execution_order files. # Each graph should have 3 iterations. Each graph was executed once per epoch. # Graph 0 was executed in even iterations, graph 1 was executed in odd iterations.