From e2a4172560db6bfa97b1c825a9d5aefa836d6307 Mon Sep 17 00:00:00 2001 From: Parastoo Ashtari Date: Sun, 27 Jun 2021 18:24:54 -0400 Subject: [PATCH] make gpu dump truly async --- mindspore/ccsrc/debug/data_dump/e2e_dump.cc | 56 +++++++++++++ mindspore/ccsrc/debug/data_dump/e2e_dump.h | 11 +++ mindspore/ccsrc/debug/debugger/debugger.cc | 81 +++++++++++------- mindspore/ccsrc/debug/debugger/debugger.h | 8 +- .../runtime/framework/actor/debug_actor.cc | 83 +++++++++++++------ .../ccsrc/runtime/framework/graph_compiler.cc | 7 +- mindspore/ccsrc/vm/backend.cc | 6 -- 7 files changed, 189 insertions(+), 63 deletions(-) diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc index acab5d2afb6..97703fae0db 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc @@ -76,6 +76,21 @@ void E2eDump::DumpOutput(const session::KernelGraph *graph, const std::string &d } } +void E2eDump::DumpOutputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger) { + auto &dump_json_parser = DumpJsonParser::GetInstance(); + if (!dump_json_parser.OutputNeedDump()) { + return; + } + bool trans_flag = dump_json_parser.trans_flag(); + MS_EXCEPTION_IF_NULL(node); + std::string kernel_name = node->fullname_with_scope(); + if (!dump_json_parser.NeedDump(kernel_name)) { + return; + } + DumpJsonParser::GetInstance().MatchKernel(kernel_name); + DumpOutputImpl(node, trans_flag, dump_path, &kernel_name, debugger); +} + void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, std::string *kernel_name, const Debugger *debugger) { MS_EXCEPTION_IF_NULL(node); @@ -128,6 +143,21 @@ void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &du } } +void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger) { + auto &dump_json_parser = DumpJsonParser::GetInstance(); + if (!dump_json_parser.InputNeedDump()) { + return; + } + bool trans_flag = dump_json_parser.trans_flag(); + MS_EXCEPTION_IF_NULL(node); + std::string kernel_name = node->fullname_with_scope(); + if (!dump_json_parser.NeedDump(kernel_name)) { + return; + } + DumpJsonParser::GetInstance().MatchKernel(kernel_name); + DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger); +} + void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, std::string *kernel_name, const Debugger *debugger) { MS_EXCEPTION_IF_NULL(node); @@ -372,6 +402,32 @@ bool E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons return success; } +bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger) { + bool success = false; + auto &dump_json_parser = DumpJsonParser::GetInstance(); + if (dump_json_parser.GetIterDumpFlag()) { + std::string dump_path = GenerateDumpPath(graph_id, rank_id); + DumpInputSingleNode(node, dump_path, debugger); + DumpOutputSingleNode(node, dump_path, debugger); + success = true; + } + return success; +} + +bool E2eDump::DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id, + const Debugger *debugger) { + bool success = false; + uint32_t graph_id = graph->graph_id(); + auto &dump_json_parser = DumpJsonParser::GetInstance(); + if (dump_json_parser.GetIterDumpFlag()) { + MS_LOG(INFO) << "DumpParametersAndConst. Current iteration is " << dump_json_parser.cur_dump_iter(); + MS_LOG(INFO) << "Current graph id is " << graph_id; + std::string dump_path = GenerateDumpPath(graph_id, rank_id); + DumpParametersAndConst(graph, dump_path, debugger); + success = true; + } + return success; +} bool E2eDump::isDatasetGraph(const session::KernelGraph *graph) { // check if there is GetNext or InitDataSetQueue node const auto &nodes = graph->execution_order(); diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.h b/mindspore/ccsrc/debug/data_dump/e2e_dump.h index 8ce66809567..efb005ff711 100644 --- a/mindspore/ccsrc/debug/data_dump/e2e_dump.h +++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.h @@ -36,6 +36,12 @@ class E2eDump { ~E2eDump() = default; static void DumpSetup(const session::KernelGraph *graph, uint32_t rank_id); static bool DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr); + + static bool DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger); + + static bool DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, + const Debugger *debugger = nullptr); + static bool isDatasetGraph(const session::KernelGraph *graph); // Dump data when task error. static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, @@ -45,8 +51,13 @@ class E2eDump { private: static void DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger); + + static void DumpOutputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger); + static void DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger); + static void DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger); + static void DumpParametersAndConst(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger); diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index 8b45ca0065c..2e04ab786e3 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -317,7 +317,6 @@ void Debugger::PreExecuteGraphDebugger(const std::vector &graphs } void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) { // access lock for public method - std::lock_guard a_lock(access_lock_); CheckDatasetSinkMode(); auto graph_id = graph_ptr->graph_id(); @@ -392,7 +391,7 @@ bool Debugger::DumpDataEnabledIteration() const { return false; } -void Debugger::Dump(const KernelGraphPtr &kernel_graph) const { +uint32_t Debugger::GetRankID() { auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); std::string device_target = ms_context->get_param(MS_CTX_DEVICE_TARGET); @@ -400,23 +399,28 @@ void Debugger::Dump(const KernelGraphPtr &kernel_graph) const { const auto &device_context = device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id}); uint32_t rank_id = device_context->GetRankID(); + return rank_id; +} +void Debugger::Dump(const KernelGraphPtr &kernel_graph) const { + uint32_t rank_id = GetRankID(); if (debugger_->DebuggerBackendEnabled()) { MS_EXCEPTION_IF_NULL(kernel_graph); - E2eDump::DumpData(kernel_graph.get(), rank_id, debugger_.get()); + E2eDump::DumpParametersAndConstData(kernel_graph.get(), rank_id, debugger_.get()); } else { DumpJsonParser::GetInstance().UpdateDumpIter(); } } +void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id) { + if (debugger_->DebuggerBackendEnabled()) { + uint32_t rank_id = GetRankID(); + E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get()); + } +} + void Debugger::DumpSetup(const KernelGraphPtr &kernel_graph) const { MS_LOG(INFO) << "Start!"; - auto ms_context = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(ms_context); - std::string device_target = ms_context->get_param(MS_CTX_DEVICE_TARGET); - uint32_t device_id = ms_context->get_param(MS_CTX_DEVICE_ID); - const auto &device_context = - device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id}); - uint32_t rank_id = device_context->GetRankID(); + uint32_t rank_id = GetRankID(); MS_EXCEPTION_IF_NULL(kernel_graph); E2eDump::DumpSetup(kernel_graph.get(), rank_id); MS_LOG(INFO) << "Finish!"; @@ -425,13 +429,7 @@ void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) { // This function will be called for new GPU runtime using MindRTBackend auto &json_parser = DumpJsonParser::GetInstance(); if (json_parser.e2e_dump_enabled()) { - auto ms_context = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(ms_context); - std::string device_target = ms_context->get_param(MS_CTX_DEVICE_TARGET); - uint32_t device_id = ms_context->get_param(MS_CTX_DEVICE_ID); - const auto &device_context = - device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_target, device_id}); - uint32_t rank_id = device_context->GetRankID(); + uint32_t rank_id = GetRankID(); kernel_graph->set_root_graph_id(kernel_graph->graph_id()); std::string final_graph = "trace_code_graph_" + std::to_string(kernel_graph->graph_id()); std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id); @@ -443,23 +441,32 @@ void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) { kernel_graph->execution_order()); } } -void Debugger::PostExecuteGraphDebugger(const std::vector &graphs) { + +void Debugger::PostExecuteGraphDebugger() { // Only GPU is supported for MindRTBackend if (device_target_ != kGPUDevice) { return; } - for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) { - const auto &graph = graphs[graph_index]; - bool dump_enabled = debugger_->DumpDataEnabledIteration(); - // debug used for dump - if (debugger_ && dump_enabled) { + // LoadParametersAndConst for all the graphs + for (auto graph : graph_ptr_list_) { + debugger_->LoadParametersAndConst(graph); + } + bool dump_enabled = debugger_->DumpDataEnabledIteration(); + // debug used for dump + if (debugger_ && dump_enabled) { + // Dump Parameters and consts + for (auto graph : graph_ptr_list_) { debugger_->Dump(graph); - } else { - DumpJsonParser::GetInstance().UpdateDumpIter(); - } - if (debugger_) { - debugger_->PostExecute(); + if (!debugger_->debugger_enabled()) { + debugger_->ClearCurrentData(); + } } + + } else { + DumpJsonParser::GetInstance().UpdateDumpIter(); + } + if (debugger_) { + debugger_->PostExecute(); } } @@ -1340,6 +1347,24 @@ void Debugger::LoadParametersAndConst() { } } +void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) { + if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return; + MS_EXCEPTION_IF_NULL(graph); + // load parameters + MS_LOG(INFO) << "Start to load Parameters for graph " << graph->graph_id(); + const auto ¶meters = graph_ptr_->inputs(); + for (auto &item : parameters) { + LoadSingleAnfnode(item, PARAMETER_OUTPUT_INDEX); + } + // load value nodes + // get all constant avlues from the graph + MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id(); + const auto value_nodes = graph_ptr_->graph_value_nodes(); + for (auto &item : value_nodes) { + LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX); + } +} + void Debugger::LoadGraphOutputs() { if (!(debugger_enabled() && device_target_ == kAscendDevice)) return; MS_EXCEPTION_IF_NULL(graph_ptr_); diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index 36eae10242c..c71afc129f8 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -85,13 +85,17 @@ class Debugger : public std::enable_shared_from_this { bool DumpDataEnabledIteration() const; + static uint32_t GetRankID(); + void Dump(const KernelGraphPtr &kernel_graph) const; + void DumpSingleNode(const CNodePtr &node, uint32_t graph_id); + void DumpSetup(const KernelGraphPtr &kernel_graph) const; void DumpInGraphCompiler(const KernelGraphPtr &kernel_graph); - void PostExecuteGraphDebugger(const std::vector &graphs); + void PostExecuteGraphDebugger(); bool ReadNodeDataRequired(const CNodePtr &kernel) const; @@ -141,6 +145,8 @@ class Debugger : public std::enable_shared_from_this { void LoadParametersAndConst(); + void LoadParametersAndConst(const KernelGraphPtr &graph); + void UpdateStepNum(const session::KernelGraph *graph); void UpdateStepNumGPU(); diff --git a/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc b/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc index fc49686083c..daa64162195 100644 --- a/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc +++ b/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc @@ -27,6 +27,7 @@ using mindspore::kernel::AddressPtr; using AddressPtrList = std::vector; +using KernelGraph = mindspore::session::KernelGraph; #endif namespace mindspore { namespace runtime { @@ -100,6 +101,56 @@ void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, ui } } } + +bool CheckReadData(const CNodePtr &cnode) { + auto debugger = Debugger::GetInstance(); + if (!debugger) { + return false; + } + bool read_data = false; + auto &dump_json_parser = DumpJsonParser::GetInstance(); + bool dump_enabled = debugger->DumpDataEnabledIteration(); + std::string kernel_name = cnode->fullname_with_scope(); + if (dump_enabled) { + auto dump_mode = dump_json_parser.dump_mode(); + // dump the node if dump_mode is 0, which means all kernels, or if this kernel is in the kernels list + if ((dump_mode == 0) || ((dump_mode == 1) && dump_json_parser.NeedDump(kernel_name))) { + read_data = true; + } + } else if (debugger->debugger_enabled()) { + read_data = debugger->ReadNodeDataRequired(cnode); + } + return read_data; +} + +void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_) { + auto debugger = Debugger::GetInstance(); + if (!debugger) { + return; + } + auto &dump_json_parser = DumpJsonParser::GetInstance(); + bool dump_enabled = debugger->DumpDataEnabledIteration(); + if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) { + LoadInputs(cnode, launch_info_, exec_order_); + } + if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) { + LoadOutputs(cnode, launch_info_, exec_order_); + } + // Dump kernel + if (dump_enabled) { + auto kernel_graph = std::dynamic_pointer_cast(cnode->func_graph()); + MS_EXCEPTION_IF_NULL(kernel_graph); + auto graph_id = kernel_graph->graph_id(); + debugger->DumpSingleNode(cnode, graph_id); + // Clear Dumped data when online debugger is not enabled + if (!debugger->debugger_enabled()) { + debugger->ClearCurrentData(); + } + } + // check if the node is last kernel + bool last_kernel = !AnfAlgo::IsInplaceNode(cnode, "skip"); + debugger->PostExecuteNode(cnode, last_kernel); +} #endif void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_, @@ -108,36 +159,19 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in MS_EXCEPTION_IF_NULL(device_context); MS_EXCEPTION_IF_NULL(op_context); MS_EXCEPTION_IF_NULL(from_aid); -// todo debug. + // todo debug. + MS_LOG(INFO) << "DebugActor is called"; #ifdef ENABLE_GPU if (node->isa()) { const auto &cnode = node->cast(); auto debugger = Debugger::GetInstance(); if (debugger) { std::string kernel_name = cnode->fullname_with_scope(); + MS_LOG(INFO) << "kernel_name is " << kernel_name; debugger->SetCurNode(kernel_name); - bool read_data = false; - auto &dump_json_parser = DumpJsonParser::GetInstance(); - bool dump_enabled = debugger->DumpDataEnabledIteration(); - if (dump_enabled) { - auto dump_mode = dump_json_parser.dump_mode(); - // dump the node if dump_mode is 0, which means all kernels, or if this kernel is in the kernels list - if ((dump_mode == 0) || ((dump_mode == 1) && dump_json_parser.NeedDump(kernel_name))) { - read_data = true; - } - } else if (debugger->debugger_enabled()) { - read_data = debugger->ReadNodeDataRequired(cnode); - } + bool read_data = CheckReadData(cnode); if (read_data) { - if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) { - LoadInputs(cnode, launch_info_, exec_order_); - } - if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) { - LoadOutputs(cnode, launch_info_, exec_order_); - } - // check if the node is last kernel - bool last_kernel = !AnfAlgo::IsInplaceNode(cnode, "skip"); - debugger->PostExecuteNode(cnode, last_kernel); + ReadDataAndDump(cnode, launch_info_, exec_order_); } } exec_order_ += 1; @@ -150,14 +184,15 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in void DebugActor::DebugOnStepEnd(OpContext *op_context, const AID *from_aid) { MS_EXCEPTION_IF_NULL(op_context); MS_EXCEPTION_IF_NULL(from_aid); -// todo debug. + // todo debug. + MS_LOG(INFO) << "DebugActor::DebugOnStepEnd is called"; #ifdef ENABLE_GPU auto debugger = Debugger::GetInstance(); if (debugger) { debugger->Debugger::UpdateStepNumGPU(); - debugger->Debugger::LoadParametersAndConst(); // Reset exec_order for the next step exec_order_ = 0; + debugger->Debugger::PostExecuteGraphDebugger(); } #endif // Call back to the from actor to process after debug finished. diff --git a/mindspore/ccsrc/runtime/framework/graph_compiler.cc b/mindspore/ccsrc/runtime/framework/graph_compiler.cc index 57dbfa46046..9d00ff77c29 100644 --- a/mindspore/ccsrc/runtime/framework/graph_compiler.cc +++ b/mindspore/ccsrc/runtime/framework/graph_compiler.cc @@ -304,16 +304,15 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic } graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get())); -#ifdef ENABLE_DEBUGGER - auto debugger = Debugger::GetInstance(); - debugger->DumpInGraphCompiler(graph); -#endif + MS_EXCEPTION_IF_NULL(session_); session_->InitAllBucket(graph, device_context); session_->SetSummaryNodes(graph.get()); SetSummaryNodesRefCount(graph.get()); #ifdef ENABLE_DEBUGGER + auto debugger = Debugger::GetInstance(); + debugger->DumpInGraphCompiler(graph); if (debugger && debugger->DebuggerBackendEnabled()) { debugger->LoadGraphs(graph); } diff --git a/mindspore/ccsrc/vm/backend.cc b/mindspore/ccsrc/vm/backend.cc index 64a8e302812..9d9f8185325 100644 --- a/mindspore/ccsrc/vm/backend.cc +++ b/mindspore/ccsrc/vm/backend.cc @@ -606,12 +606,6 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args, if (!runtime::GraphScheduler::GetInstance().Run(actor_set)) { MS_LOG(EXCEPTION) << "The actor runs failed, actor name: " << actor_set->name_; } -// Debugger post-execute graph. -#ifdef ENABLE_DEBUGGER - if (Debugger::GetInstance()->DebuggerBackendEnabled()) { - Debugger::GetInstance()->PostExecuteGraphDebugger(graph_compiler_info.graphs_); - } -#endif // Sync device stream. const auto &first_device_context = graph_compiler_info.device_contexts_[0];