From e761655a4208e6af35144e464b5a413588edbdf1 Mon Sep 17 00:00:00 2001
From: limingqi107
Date: Wed, 30 Jun 2021 18:09:54 +0800
Subject: [PATCH] actor runtime support CPU dump

---
 .../ccsrc/debug/data_dump/cpu_e2e_dump.cc      | 10 ++++++
 .../ccsrc/debug/data_dump/cpu_e2e_dump.h       |  2 ++
 .../ccsrc/debug/data_dump/dump_json_parser.h   |  8 +++++
 .../runtime/framework/actor/debug_actor.cc     | 31 ++++++++++++++++---
 .../ccsrc/runtime/framework/graph_compiler.cc  |  5 +--
 .../runtime/framework/graph_scheduler.cc       | 24 ++++++++------
 .../hardware/cpu/cpu_device_context.cc         | 10 ++++++
 .../runtime/hardware/cpu/cpu_memory_pool.cc    |  4 +++
 .../hardware/gpu/gpu_device_context.cc         |  3 +-
 mindspore/ccsrc/vm/backend.cc                  | 27 ++++++++++++----
 tests/st/dump/test_data_dump.py                |  1 -
 11 files changed, 99 insertions(+), 26 deletions(-)

diff --git a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc
index 0ef3bbebc94..2f7751f3c0e 100644
--- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc
+++ b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc
@@ -134,6 +134,9 @@ void CPUE2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t outp
   }
   auto addr = AnfAlgo::GetOutputAddr(anf_node, output_index);
   MS_EXCEPTION_IF_NULL(addr);
+  if (addr->GetPtr() == nullptr) {
+    return;
+  }
   ShapeVector int_shapes;
   GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes));
   auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
@@ -164,4 +167,11 @@ void CPUE2eDump::DumpParametersAndConst(const session::KernelGraph *graph, uint3
     DumpSingleAnfNode(value_node, VALUE_NODE_OUTPUT_INDEX, dump_path, &const_map);
   }
 }
+
+void CPUE2eDump::DumpParametersAndConst() {
+  auto &graphs = DumpJsonParser::GetInstance().graphs();
+  for (auto graph : graphs) {
+    DumpParametersAndConst(graph, graph->graph_id());
+  }
+}
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h
index f1b2c0f3b95..17d0aad2c2a 100644
--- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h
+++ b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h
@@ -31,6 +31,8 @@ class CPUE2eDump {
   // Dump data when task error.
   static void DumpParametersAndConst(const session::KernelGraph *graph, uint32_t graph_id);

+  static void DumpParametersAndConst();
+
   static void DumpCNodeData(const CNodePtr &node, uint32_t graph_id);

  private:
diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.h b/mindspore/ccsrc/debug/data_dump/dump_json_parser.h
index 72ca3955434..cc0524b9be2 100644
--- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.h
+++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.h
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <vector>
 #include "nlohmann/json.hpp"
 #include "utils/ms_utils.h"
 #include "backend/session/kernel_graph.h"
@@ -62,6 +63,10 @@
   std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
   void UpdateNeedDumpKernels(NotNull kernel_graph);

+  void ClearGraph() { graphs_.clear(); }
+  void SaveGraph(session::KernelGraph *graph) { graphs_.emplace_back(graph); }
+  std::vector<session::KernelGraph *> &graphs() { return graphs_; }
+
  private:
   DumpJsonParser() = default;
   ~DumpJsonParser() = default;
@@ -82,6 +87,9 @@
   uint32_t cur_dump_iter_{0};
   bool already_parsed_{false};

+  // Save graphs for dump.
+  std::vector<session::KernelGraph *> graphs_;
+
   void ParseCommonDumpSetting(const nlohmann::json &content);
   void ParseE2eDumpSetting(const nlohmann::json &content);
   bool IsDumpEnabled();
diff --git a/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc b/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
index 27c3bba1e81..f9e23452079 100644
--- a/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
@@ -21,6 +21,7 @@
 #include "runtime/framework/actor/debug_aware_actor.h"
 #include "mindrt/include/async/async.h"
 #include "utils/log_adapter.h"
+#include "debug/data_dump/cpu_e2e_dump.h"
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
 #include "debug/debugger/debugger_utils.h"
 #endif
@@ -35,10 +36,22 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
   MS_EXCEPTION_IF_NULL(device_context);
   MS_EXCEPTION_IF_NULL(op_context);
   MS_EXCEPTION_IF_NULL(from_aid);
-  // todo debug.
+
+  if (!node->isa<CNode>()) {
+    // Call back to the from actor to process after debug finished.
+    Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
+    return;
+  }
+
+  const auto &cnode = node->cast<CNodePtr>();
+  if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kCPU) {
+    if (DumpJsonParser::GetInstance().GetIterDumpFlag()) {
+      auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
+      MS_EXCEPTION_IF_NULL(kernel_graph);
+      CPUE2eDump::DumpCNodeData(cnode, kernel_graph->graph_id());
+    }
+  } else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {
 #ifdef ENABLE_DEBUGGER
-  if (node->isa<CNode>()) {
-    const auto &cnode = node->cast<CNodePtr>();
     auto debugger = Debugger::GetInstance();
     if (debugger) {
       std::string kernel_name = cnode->fullname_with_scope();
@@ -49,8 +62,9 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
       }
     }
     exec_order_ += 1;
-  }
 #endif
+  }
+
   // Call back to the from actor to process after debug finished.
   Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
 }
@@ -58,7 +72,11 @@
 void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid) {
   MS_EXCEPTION_IF_NULL(op_context);
   MS_EXCEPTION_IF_NULL(from_aid);
-  // todo debug.
+
+  if (DumpJsonParser::GetInstance().GetIterDumpFlag()) {
+    CPUE2eDump::DumpParametersAndConst();
+  }
+
 #ifdef ENABLE_DEBUGGER
   auto debugger = Debugger::GetInstance();
   if (debugger) {
@@ -67,7 +85,10 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *
     exec_order_ = 0;
     debugger->Debugger::PostExecuteGraphDebugger();
   }
+#else
+  DumpJsonParser::GetInstance().UpdateDumpIter();
 #endif
+
   // Call back to the from actor to process after debug finished.
   Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
 }
diff --git a/mindspore/ccsrc/runtime/framework/graph_compiler.cc b/mindspore/ccsrc/runtime/framework/graph_compiler.cc
index 936f5735e95..39516a32376 100644
--- a/mindspore/ccsrc/runtime/framework/graph_compiler.cc
+++ b/mindspore/ccsrc/runtime/framework/graph_compiler.cc
@@ -299,12 +299,9 @@ GraphId GraphCompiler::CompileGraph(const AnfNodePtrList &nodes, const AnfNodePt
 GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const DeviceContext *device_context) const {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(device_context);
-
-  auto &json_parser = DumpJsonParser::GetInstance();
-  json_parser.Parse();
-
   const auto &ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
+
   bool save_graphs = ms_context->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
   // Dump .pb graph before graph optimization.
   if (save_graphs) {
diff --git a/mindspore/ccsrc/runtime/framework/graph_scheduler.cc b/mindspore/ccsrc/runtime/framework/graph_scheduler.cc
index e04ddfd9afc..e6c2911e982 100644
--- a/mindspore/ccsrc/runtime/framework/graph_scheduler.cc
+++ b/mindspore/ccsrc/runtime/framework/graph_scheduler.cc
@@ -28,12 +28,14 @@
 #include "utils/convert_utils.h"
 #include "utils/ms_context.h"
 #include "common/trans.h"
+#include "debug/data_dump/dump_json_parser.h"
 #ifdef ENABLE_DUMP_IR
 #include "debug/rdr/recorder_manager.h"
 #endif
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
 #endif
+
 namespace mindspore {
 namespace runtime {
 namespace {
@@ -480,10 +482,14 @@ void GraphScheduler::Initialize() {
     (void)actorMgr->Spawn(base_recorder_actor, true);
   }
 #endif
-// Create and schedule debug actor.
+  // Create and schedule debug actor.
+  bool debugger_actor_need = DumpJsonParser::GetInstance().e2e_dump_enabled();
 #ifdef ENABLE_DEBUGGER
-  auto debugger = mindspore::Debugger::GetInstance();
-  if (debugger->DebuggerBackendEnabled()) {
+  if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
+    debugger_actor_need = true;
+  }
+#endif
+  if (debugger_actor_need) {
     auto debug_actor = std::make_shared<DebugActor>();
     MS_EXCEPTION_IF_NULL(debug_actor);
     debug_aid_ = &(debug_actor->GetAID());
+    base_debug_actor->set_thread_pool(thread_pool_);
     (void)actorMgr->Spawn(base_debug_actor, true);
   }
-#endif
 }

 ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info) {
@@ -1343,7 +1348,8 @@ void GraphScheduler::LinkDataArrow(KernelActor *to_actor, const GraphCompilerInf
     const auto &from_actor = dynamic_cast<KernelActor *>(FetchActor(from_kernel->fullname_with_scope()));
     LinkDataArrowForKernelActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
   } else if (IsInternalParameter(from_kernel, graph)) {
-    // Link data arrow for internal parameter, convert internal parameter to actor by internal parameter cache to link.
+    // Link data arrow for internal parameter, convert internal parameter to actor by internal parameter cache to
+    // link.
     LinkDataArrowForInternalParameter(from_kernel, graph_compiler_info.origin_parameters_order_, graph, to_actor,
                                       to_kernel_with_input_idx);
   } else if (IsPersistentDeviceTensor(from_kernel)) {
@@ -1602,8 +1608,8 @@ void GraphScheduler::LinkControlArrowByAutoMonad(KernelActor *to_actor, const An
     if (AnfAlgo::CheckPrimitiveType(input_cnode, prim::kPrimDepend) ||
         AnfAlgo::CheckPrimitiveType(input_cnode, prim::kPrimLoad)) {
       real_depend_inputs.push_back(input_cnode->input(kDependAttachNodeIndex));
-      // The real input may be this scene: depend/load --> load/depend, so need add the control arrow for real input node
-      // in this scene.
+      // The real input may be this scene: depend/load --> load/depend, so need add the control arrow for real input
+      // node in this scene.
       if (AnfAlgo::IsOneOfPrimitiveCNode(input_cnode->input(kRealInputIndexInDepend), recursion_prims)) {
         real_depend_inputs.push_back(input_cnode->input(kRealInputIndexInDepend));
       }
@@ -1707,8 +1713,8 @@ void GraphScheduler::LinkControlArrowBySendRecvNodes(const KernelGraphPtr &graph
       output_actor->input_controls_num_++;
     }

-    // In the scene of allreduce op and computing op parallel multi stream, the input memory of allreduce can be reused
-    // only when the recv node runs finished, which is expressed by the reference count increased.
+    // In the scene of allreduce op and computing op parallel multi stream, the input memory of allreduce can be
+    // reused only when the recv node runs finished, which is expressed by the reference count increased.
     for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(from_allreduce_node); ++i) {
       auto device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(from_allreduce_node, i, false);
       MS_EXCEPTION_IF_NULL(device_tensor);
diff --git a/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc b/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc
index 09ca9b7676a..eea6ed71ed8 100644
--- a/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc
+++ b/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc
@@ -29,6 +29,7 @@
 #include "backend/optimizer/pass/replace_node_by_proxy.h"
 #include "backend/optimizer/pass/erase_visit_attr.h"
 #include "profiler/device/cpu/cpu_profiling.h"
+#include "debug/data_dump/dump_json_parser.h"

 namespace mindspore {
 namespace device {
@@ -37,8 +38,17 @@ bool CPUDeviceContext::Initialize() {
   if (initialized_) {
     return true;
   }
+
   mem_manager_ = std::make_shared<CPUMemoryManager>();
   MS_EXCEPTION_IF_NULL(mem_manager_);
+
+  // Dump json config file if dump is enabled.
+  auto rank_id = GetRankID();
+  auto &json_parser = DumpJsonParser::GetInstance();
+  json_parser.Parse();
+  json_parser.CopyJsonToDir(rank_id);
+  json_parser.CopyMSCfgJsonToDir(rank_id);
+
   initialized_ = true;
   return true;
 }
diff --git a/mindspore/ccsrc/runtime/hardware/cpu/cpu_memory_pool.cc b/mindspore/ccsrc/runtime/hardware/cpu/cpu_memory_pool.cc
index 455f2227a2d..32497e5c401 100644
--- a/mindspore/ccsrc/runtime/hardware/cpu/cpu_memory_pool.cc
+++ b/mindspore/ccsrc/runtime/hardware/cpu/cpu_memory_pool.cc
@@ -26,6 +26,9 @@ const size_t kKBToByte = 1024;
 const size_t kLineMaxSize = 1024;
 size_t GetSystemMemorySize(const std::string &key) {
+#if defined(_WIN32) || defined(_WIN64)
+  return SIZE_MAX;
+#else
   FILE *file = fopen("/proc/meminfo", "r");
   if (file == nullptr) {
     MS_LOG(EXCEPTION) << "Get system meminfo failed.";
   }
@@ -53,6 +56,7 @@ size_t GetSystemMemorySize(const std::string &key) {
   fclose(file);
   return mem_size * kKBToByte;
+#endif
 }
 }  // namespace
diff --git a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc
index 6bc13fd8c36..6d34080a308 100644
--- a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc
+++ b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc
@@ -92,9 +92,10 @@ bool GPUDeviceContext::Initialize() {
     (*init_nccl_comm_funcptr)();
   }

+  // Dump json config file if dump is enabled.
   auto rank_id = GetRankID();
   auto &json_parser = DumpJsonParser::GetInstance();
-  // Dump json config file if dump is enabled
+  json_parser.Parse();
   json_parser.CopyJsonToDir(rank_id);
   json_parser.CopyMSCfgJsonToDir(rank_id);
diff --git a/mindspore/ccsrc/vm/backend.cc b/mindspore/ccsrc/vm/backend.cc
index b79b5d4f14c..337390d06d3 100644
--- a/mindspore/ccsrc/vm/backend.cc
+++ b/mindspore/ccsrc/vm/backend.cc
@@ -39,6 +39,8 @@
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
 #endif
+#include "debug/data_dump/dump_json_parser.h"
+
 namespace mindspore {
 namespace compile {
 bool Backend::GetCond(const BaseRef &c, bool *const value) { return BaseRefToBool(c, value); }
@@ -602,6 +604,23 @@ bool IsGraphOutputValueNodeOrParameter(const AnfNodePtr &graph_output, const Vec
   }
   return false;
 }
+
+void PrepareForDebuggr(const GraphCompilerInfo &graph_compiler_info) {
+#ifdef ENABLE_DEBUGGER
+  if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
+    Debugger::GetInstance()->PreExecuteGraphDebugger(graph_compiler_info.graphs_);
+  }
+#endif
+
+  if (DumpJsonParser::GetInstance().e2e_dump_enabled()) {
+    DumpJsonParser::GetInstance().ClearGraph();
+    for (size_t i = 0; i < graph_compiler_info.graphs_.size(); ++i) {
+      if (graph_compiler_info.device_contexts_[i]->GetDeviceAddressType() == device::DeviceAddressType::kCPU) {
+        DumpJsonParser::GetInstance().SaveGraph(graph_compiler_info.graphs_[i].get());
+      }
+    }
+  }
+}
 }  // namespace

 void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs,
@@ -719,12 +738,8 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
   const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info);
   MS_EXCEPTION_IF_NULL(actor_set);
   runtime::GraphScheduler::GetInstance().PrepareRun(actor_set, graph_compiler_info, input_tensors);
-// Debugger pre-execute graph.
-#ifdef ENABLE_DEBUGGER
-  if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
-    Debugger::GetInstance()->PreExecuteGraphDebugger(graph_compiler_info.graphs_);
-  }
-#endif
+  // Debugger pre-execute graph.
+  PrepareForDebuggr(graph_compiler_info);
   if (!runtime::GraphScheduler::GetInstance().Run(actor_set)) {
     MS_LOG(EXCEPTION) << "The actor runs failed, actor name: " << actor_set->name_;
   }
diff --git a/tests/st/dump/test_data_dump.py b/tests/st/dump/test_data_dump.py
index b5f9bd4ee90..6c6dae8c014 100644
--- a/tests/st/dump/test_data_dump.py
+++ b/tests/st/dump/test_data_dump.py
@@ -116,7 +116,6 @@ def test_cpu_e2e_dump():
     context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
     run_e2e_dump()

-
 class ReluReduceMeanDenseRelu(Cell):
     def __init__(self, kernel, bias, in_channel, num_class):
         super().__init__()