diff --git a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc index 3e4ab7bb3b6..3ff01b45b6d 100644 --- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc @@ -44,7 +44,8 @@ void CPUE2eDump::DumpCNodeData(const CNodePtr &node, uint32_t graph_id) { void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) { auto &json_parser = DumpJsonParser::GetInstance(); - if (!(json_parser.e2e_dump_enabled())) { + // avoid dumping same iteration over and over + if (!(json_parser.e2e_dump_enabled()) || json_parser.cur_dump_iter() == prev_run_iter_) { return; } std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/"; @@ -65,6 +66,7 @@ void CPUE2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) { fout << std::to_string(json_parser.cur_dump_iter()) + "\n"; fout.close(); ChangeFileMode(file_name, S_IRUSR); + prev_run_iter_ = json_parser.cur_dump_iter(); } void CPUE2eDump::DumpCNodeInputs(const CNodePtr &node, const std::string &dump_path) { diff --git a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h index e1078f8806c..b941b878030 100644 --- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h +++ b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h @@ -51,6 +51,8 @@ class CPUE2eDump { static void DumpInputImpl(const CNodePtr &node, const std::string &dump_path, std::string *kernel_name); static void DumpOutputImpl(const CNodePtr &node, const std::string &dump_path, std::string *kernel_name); + + inline static unsigned int prev_run_iter_ = UINT32_MAX; }; } // namespace mindspore #endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_CPU_E_2_E_DUMP_H_ diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 90fe1404e09..540e8294582 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -292,6 +292,7 @@ void Debugger::Reset() { graph_proto_list_.clear(); graph_ptr_list_.clear(); graph_ptr_step_vec_.clear(); + executed_graph_ptr_set_.clear(); parameters_mindRT_.clear(); visited_root_graph_ids_.clear(); MS_LOG(INFO) << "Release Debugger resource."; @@ -502,8 +503,10 @@ void Debugger::DumpParamsAndConstAndHistory() { // Dump constant data for Ascend. DumpConstantDataAscend(graph); } + } + for (auto kernel_graph : executed_graph_ptr_set_) { // Dump graph run hisotry for each graph. - E2eDump::DumpRunIter(graph, GetRankID()); + E2eDump::DumpRunIter(kernel_graph, GetRankID()); } if (!cur_root_graph_checked) { visited_root_graph_ids_.push_back(cur_root_graph_id_); @@ -583,6 +586,7 @@ void Debugger::PostExecuteGraphDebugger() { debugger_->PostExecute(); } E2eDump::UpdateIterMindRTDump(); + executed_graph_ptr_set_.clear(); } /* diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index b6a06c161f4..3f709fa7502 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "backend/common/session/kernel_graph.h" #include "debug/debugger/grpc_client.h" #include "debug/debug_services.h" @@ -174,6 +175,8 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this { std::vector GetStepGraphPtrList() const { return graph_ptr_step_vec_; } + void InsertExecutedGraph(const KernelGraphPtr &graph_ptr) { executed_graph_ptr_set_.insert(graph_ptr); } + void SetGraphPtr(const KernelGraphPtr &graph_ptr) { graph_ptr_ = graph_ptr; } const KernelGraphPtr GetGraphPtr() const { return graph_ptr_; } @@ -317,8 +320,10 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this { std::list graph_proto_list_; std::list graph_ptr_list_; - // The vector of graph pointers that have been run in the current step. + // The vector of all the kernel graph pointers for the root graph that will execute in the current step. std::vector graph_ptr_step_vec_; + // The set of graph pointers that have been run in the current step. + std::set executed_graph_ptr_set_; // The vector of all the parameters for the current step for mindRT. std::vector parameters_mindRT_; std::vector visited_root_graph_ids_; diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc index c80644d3cfc..6de63d93e6c 100644 --- a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc +++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc @@ -67,6 +67,8 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in #ifdef ENABLE_DEBUGGER auto debugger = Debugger::GetInstance(); if (debugger != nullptr) { + auto kernel_graph = std::dynamic_pointer_cast(cnode->func_graph()); + debugger->InsertExecutedGraph(kernel_graph); std::string kernel_name = cnode->fullname_with_scope(); debugger->SetCurNode(kernel_name); bool read_data = CheckReadData(cnode); @@ -80,6 +82,8 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in #ifdef ENABLE_DEBUGGER auto debugger = Debugger::GetInstance(); if (debugger != nullptr) { + auto kernel_graph = std::dynamic_pointer_cast(cnode->func_graph()); + debugger->InsertExecutedGraph(kernel_graph); debugger->SetAscendKernelByKernelFlag(true); bool read_data = CheckReadData(cnode); if (read_data) { @@ -108,6 +112,10 @@ void DebugActor::DebugForGraph(const KernelGraphPtr &graph, const DeviceContext MS_EXCEPTION_IF_NULL(from_aid); MS_LOG(DEBUG) << "Super kernel debug for graph: " << graph->graph_id() << "."; #ifdef ENABLE_DEBUGGER + auto debugger = Debugger::GetInstance(); + if (debugger != nullptr) { + debugger->InsertExecutedGraph(graph); + } LoadDataForDebugger(graph); // This function updates graph history file and cur_dump_iter if dump is enabled. // When e2e dump is enabled, this function dumps the graph. diff --git a/tests/st/dump/test_multi_root_graph_dump.py b/tests/st/dump/test_multi_root_graph_dump.py index 64bb3cf0343..1ff93c068bb 100644 --- a/tests/st/dump/test_multi_root_graph_dump.py +++ b/tests/st/dump/test_multi_root_graph_dump.py @@ -108,6 +108,8 @@ def run_multi_root_graph_dump(device, dump_mode, test_name): assert len(os.listdir(execution_order_path)) == 8 check_graph_structure(dump_file_path, execution_order_path, '0', ['0', '2', '4']) check_graph_structure(dump_file_path, execution_order_path, '1', ['1', '3', '5']) + check_graph_structure(dump_file_path, execution_order_path, '2', ['1', '3']) + check_graph_structure(dump_file_path, execution_order_path, '3', ['5']) else: # In Ascend, we have 2 root graphs folders under rank_0 dir. # In graph history dir, there are 4 ms_execution_order files and 2 ms_global_execution_order files.