actor runtime support CPU dump

2021-06-30 18:09:54 +08:00 · 2021-06-30 18:09:54 +08:00 · e761655a42
parent 8008103050
commit e761655a42
11 changed files with 99 additions and 26 deletions
--- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc
+++ b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.cc
@ -134,6 +134,9 @@ void CPUE2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t outp
  }
  auto addr = AnfAlgo::GetOutputAddr(anf_node, output_index);
  MS_EXCEPTION_IF_NULL(addr);
+  if (addr->GetPtr() == nullptr) {
+    return;
+  }
  ShapeVector int_shapes;
  GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes));
  auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
@ -164,4 +167,11 @@ void CPUE2eDump::DumpParametersAndConst(const session::KernelGraph *graph, uint3
    DumpSingleAnfNode(value_node, VALUE_NODE_OUTPUT_INDEX, dump_path, &const_map);
  }
 }
+
+// Invoke the per-graph DumpParametersAndConst overload for every graph held by DumpJsonParser.
+void CPUE2eDump::DumpParametersAndConst() {
+  for (auto *saved_graph : DumpJsonParser::GetInstance().graphs()) {
+    DumpParametersAndConst(saved_graph, saved_graph->graph_id());
+  }
+}
 }  // namespace mindspore
--- a/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h
+++ b/mindspore/ccsrc/debug/data_dump/cpu_e2e_dump.h
@ -31,6 +31,8 @@ class CPUE2eDump {
  // Dump data when task error.
  static void DumpParametersAndConst(const session::KernelGraph *graph, uint32_t graph_id);

+  static void DumpParametersAndConst();
+
  static void DumpCNodeData(const CNodePtr &node, uint32_t graph_id);

 private:
--- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.h
+++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.h
@ -21,6 +21,7 @@
 #include <map>
 #include <set>
 #include <mutex>
+#include <vector>
 #include "nlohmann/json.hpp"
 #include "utils/ms_utils.h"
 #include "backend/session/kernel_graph.h"
@ -62,6 +63,10 @@ class DumpJsonParser {
  std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
  void UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph);

+  void ClearGraph() { graphs_.clear(); }
+  void SaveGraph(session::KernelGraph *graph) { graphs_.emplace_back(graph); }
+  std::vector<session::KernelGraph *> &graphs() { return graphs_; }
+
 private:
  DumpJsonParser() = default;
  ~DumpJsonParser() = default;
@ -82,6 +87,9 @@ class DumpJsonParser {
  uint32_t cur_dump_iter_{0};
  bool already_parsed_{false};

+  // Save graphs for dump.
+  std::vector<session::KernelGraph *> graphs_;
+
  void ParseCommonDumpSetting(const nlohmann::json &content);
  void ParseE2eDumpSetting(const nlohmann::json &content);
  bool IsDumpEnabled();
--- a/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/debug_actor.cc
@ -21,6 +21,7 @@
 #include "runtime/framework/actor/debug_aware_actor.h"
 #include "mindrt/include/async/async.h"
 #include "utils/log_adapter.h"
+#include "debug/data_dump/cpu_e2e_dump.h"
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
 #include "debug/debugger/debugger_utils.h"
@ -35,10 +36,22 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);
-  // todo debug.
+
+  if (!node->isa<CNode>()) {
+    // Call back to the from actor to process after debug finished.
+    Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
+    return;
+  }
+
+  const auto &cnode = node->cast<CNodePtr>();
+  if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kCPU) {
+    if (DumpJsonParser::GetInstance().GetIterDumpFlag()) {
+      auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
+      MS_EXCEPTION_IF_NULL(kernel_graph);
+      CPUE2eDump::DumpCNodeData(cnode, kernel_graph->graph_id());
+    }
+  } else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {
 #ifdef ENABLE_DEBUGGER
-  if (node->isa<CNode>()) {
-    const auto &cnode = node->cast<CNodePtr>();
    auto debugger = Debugger::GetInstance();
    if (debugger) {
      std::string kernel_name = cnode->fullname_with_scope();
@ -49,8 +62,9 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
      }
    }
    exec_order_ += 1;
-  }
 #endif
+  }
+
  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
 }
@ -58,7 +72,11 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
 void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *from_aid) {
  MS_EXCEPTION_IF_NULL(op_context);
  MS_EXCEPTION_IF_NULL(from_aid);
-  // todo debug.
+
+  if (DumpJsonParser::GetInstance().GetIterDumpFlag()) {
+    CPUE2eDump::DumpParametersAndConst();
+  }
+
 #ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  if (debugger) {
@ -67,7 +85,10 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *op_context, const AID *
    exec_order_ = 0;
    debugger->Debugger::PostExecuteGraphDebugger();
  }
+#else
+  DumpJsonParser::GetInstance().UpdateDumpIter();
 #endif
+
  // Call back to the from actor to process after debug finished.
  Async(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
 }
--- a/mindspore/ccsrc/runtime/framework/graph_compiler.cc
+++ b/mindspore/ccsrc/runtime/framework/graph_compiler.cc
@ -299,12 +299,9 @@ GraphId GraphCompiler::CompileGraph(const AnfNodePtrList &nodes, const AnfNodePt
 GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const DeviceContext *device_context) const {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(device_context);
-
-  auto &json_parser = DumpJsonParser::GetInstance();
-  json_parser.Parse();
-
  const auto &ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
+
  bool save_graphs = ms_context->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
  // Dump .pb graph before graph optimization.
  if (save_graphs) {
--- a/mindspore/ccsrc/runtime/framework/graph_scheduler.cc
+++ b/mindspore/ccsrc/runtime/framework/graph_scheduler.cc
@ -28,12 +28,14 @@
 #include "utils/convert_utils.h"
 #include "utils/ms_context.h"
 #include "common/trans.h"
+#include "debug/data_dump/dump_json_parser.h"
 #ifdef ENABLE_DUMP_IR
 #include "debug/rdr/recorder_manager.h"
 #endif
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
 #endif
+
 namespace mindspore {
 namespace runtime {
 namespace {
@ -480,10 +482,14 @@ void GraphScheduler::Initialize() {
    (void)actorMgr->Spawn(base_recorder_actor, true);
  }
 #endif
-// Create and schedule debug actor.
+  // Create and schedule debug actor.
+  bool debugger_actor_need = DumpJsonParser::GetInstance().e2e_dump_enabled();
 #ifdef ENABLE_DEBUGGER
-  auto debugger = mindspore::Debugger::GetInstance();
-  if (debugger->DebuggerBackendEnabled()) {
+  if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
+    debugger_actor_need = true;
+  }
+#endif
+  if (debugger_actor_need) {
    auto debug_actor = std::make_shared<DebugActor>();
    MS_EXCEPTION_IF_NULL(debug_actor);
    debug_aid_ = &(debug_actor->GetAID());
@ -491,7 +497,6 @@ void GraphScheduler::Initialize() {
    base_debug_actor->set_thread_pool(thread_pool_);
    (void)actorMgr->Spawn(base_debug_actor, true);
  }
-#endif
 }

 ActorSet *GraphScheduler::Transform(const GraphCompilerInfo &graph_compiler_info) {
@ -1343,7 +1348,8 @@ void GraphScheduler::LinkDataArrow(KernelActor *to_actor, const GraphCompilerInf
    const auto &from_actor = dynamic_cast<KernelActor *>(FetchActor(from_kernel->fullname_with_scope()));
    LinkDataArrowForKernelActor(from_actor, to_actor, from_kernel_with_output_idx, to_kernel_with_input_idx);
  } else if (IsInternalParameter(from_kernel, graph)) {
-    // Link data arrow for internal parameter, convert internal parameter to actor by internal parameter cache to link.
+    // Link data arrow for internal parameter, convert internal parameter to actor by internal parameter cache to
+    // link.
    LinkDataArrowForInternalParameter(from_kernel, graph_compiler_info.origin_parameters_order_, graph, to_actor,
                                      to_kernel_with_input_idx);
  } else if (IsPersistentDeviceTensor(from_kernel)) {
@ -1602,8 +1608,8 @@ void GraphScheduler::LinkControlArrowByAutoMonad(KernelActor *to_actor, const An
  if (AnfAlgo::CheckPrimitiveType(input_cnode, prim::kPrimDepend) ||
      AnfAlgo::CheckPrimitiveType(input_cnode, prim::kPrimLoad)) {
    real_depend_inputs.push_back(input_cnode->input(kDependAttachNodeIndex));
-    // The real input may be this scene:  depend/load --> load/depend, so need add the control arrow for real input node
-    // in this scene.
+    // The real input may be this scene:  depend/load --> load/depend, so need add the control arrow for real input
+    // node in this scene.
    if (AnfAlgo::IsOneOfPrimitiveCNode(input_cnode->input(kRealInputIndexInDepend), recursion_prims)) {
      real_depend_inputs.push_back(input_cnode->input(kRealInputIndexInDepend));
    }
@ -1707,8 +1713,8 @@ void GraphScheduler::LinkControlArrowBySendRecvNodes(const KernelGraphPtr &graph
      output_actor->input_controls_num_++;
    }

-    // In the scene of allreduce op and computing op parallel multi stream, the input memory of allreduce can be reused
-    // only when the recv node runs finished, which is expressed by the reference count increased.
+    // In the scene of allreduce op and computing op parallel multi stream, the input memory of allreduce can be
+    // reused only when the recv node runs finished, which is expressed by the reference count increased.
    for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(from_allreduce_node); ++i) {
      auto device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(from_allreduce_node, i, false);
      MS_EXCEPTION_IF_NULL(device_tensor);
--- a/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc
+++ b/mindspore/ccsrc/runtime/hardware/cpu/cpu_device_context.cc
@ -29,6 +29,7 @@
 #include "backend/optimizer/pass/replace_node_by_proxy.h"
 #include "backend/optimizer/pass/erase_visit_attr.h"
 #include "profiler/device/cpu/cpu_profiling.h"
+#include "debug/data_dump/dump_json_parser.h"

 namespace mindspore {
 namespace device {
@ -37,8 +38,17 @@ bool CPUDeviceContext::Initialize() {
  if (initialized_) {
    return true;
  }
+
  mem_manager_ = std::make_shared<CPUMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
+
+  // Dump json config file if dump is enabled.
+  auto rank_id = GetRankID();
+  auto &json_parser = DumpJsonParser::GetInstance();
+  json_parser.Parse();
+  json_parser.CopyJsonToDir(rank_id);
+  json_parser.CopyMSCfgJsonToDir(rank_id);
+
  initialized_ = true;
  return true;
 }
--- a/mindspore/ccsrc/runtime/hardware/cpu/cpu_memory_pool.cc
+++ b/mindspore/ccsrc/runtime/hardware/cpu/cpu_memory_pool.cc
@ -26,6 +26,9 @@ const size_t kKBToByte = 1024;
 const size_t kLineMaxSize = 1024;

 size_t GetSystemMemorySize(const std::string &key) {
+#if defined(_WIN32) || defined(_WIN64)
+  return SIZE_MAX;
+#else
  FILE *file = fopen("/proc/meminfo", "r");
  if (file == nullptr) {
    MS_LOG(EXCEPTION) << "Get system meminfo failed.";
@ -53,6 +56,7 @@ size_t GetSystemMemorySize(const std::string &key) {

  fclose(file);
  return mem_size * kKBToByte;
+#endif
 }
 }  // namespace

--- a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc
+++ b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc
@ -92,9 +92,10 @@ bool GPUDeviceContext::Initialize() {
    (*init_nccl_comm_funcptr)();
  }

+  // Dump json config file if dump is enabled.
  auto rank_id = GetRankID();
  auto &json_parser = DumpJsonParser::GetInstance();
-  // Dump json config file if dump is enabled
+  json_parser.Parse();
  json_parser.CopyJsonToDir(rank_id);
  json_parser.CopyMSCfgJsonToDir(rank_id);

--- a/mindspore/ccsrc/vm/backend.cc
+++ b/mindspore/ccsrc/vm/backend.cc
@ -39,6 +39,8 @@
 #ifdef ENABLE_DEBUGGER
 #include "debug/debugger/debugger.h"
 #endif
+#include "debug/data_dump/dump_json_parser.h"
+
 namespace mindspore {
 namespace compile {
 bool Backend::GetCond(const BaseRef &c, bool *const value) { return BaseRefToBool(c, value); }
@ -602,6 +604,23 @@ bool IsGraphOutputValueNodeOrParameter(const AnfNodePtr &graph_output, const Vec
  }
  return false;
 }
+
+void PrepareForDebuggr(const GraphCompilerInfo &graph_compiler_info) {
+#ifdef ENABLE_DEBUGGER
+  if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
+    Debugger::GetInstance()->PreExecuteGraphDebugger(graph_compiler_info.graphs_);
+  }
+#endif
+  auto &dump_parser = DumpJsonParser::GetInstance();  // Keeps the graph list used by the e2e dump.
+  if (dump_parser.e2e_dump_enabled()) {
+    dump_parser.ClearGraph();  // Drop graphs saved by an earlier call before saving the current ones.
+    for (size_t graph_idx = 0; graph_idx < graph_compiler_info.graphs_.size(); ++graph_idx) {
+      if (graph_compiler_info.device_contexts_[graph_idx]->GetDeviceAddressType() == device::DeviceAddressType::kCPU) {
+        dump_parser.SaveGraph(graph_compiler_info.graphs_[graph_idx].get());  // Only CPU graphs are saved.
+      }
+    }
+  }
+}
 }  // namespace

 void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs,
@ -719,12 +738,8 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &args,
  const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info);
  MS_EXCEPTION_IF_NULL(actor_set);
  runtime::GraphScheduler::GetInstance().PrepareRun(actor_set, graph_compiler_info, input_tensors);
-// Debugger pre-execute graph.
-#ifdef ENABLE_DEBUGGER
-  if (Debugger::GetInstance()->DebuggerBackendEnabled()) {
-    Debugger::GetInstance()->PreExecuteGraphDebugger(graph_compiler_info.graphs_);
-  }
-#endif
+  // Debugger pre-execute graph.
+  PrepareForDebuggr(graph_compiler_info);
  if (!runtime::GraphScheduler::GetInstance().Run(actor_set)) {
    MS_LOG(EXCEPTION) << "The actor runs failed, actor name: " << actor_set->name_;
  }
--- a/tests/st/dump/test_data_dump.py
+++ b/tests/st/dump/test_data_dump.py
@ -116,7 +116,6 @@ def test_cpu_e2e_dump():
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    run_e2e_dump()

-
 class ReluReduceMeanDenseRelu(Cell):
    def __init__(self, kernel, bias, in_channel, num_class):
        super().__init__()