!49787 add pynative mode operator overflow check for dump

Merge pull request !49787 from maoyaomin/mym_debugger_kernel_dumper
2023-03-08 06:42:11 +00:00 · 2023-03-08 06:42:11 +00:00 · 9236c3b4e9
parent f58509ff73 67bfdcab09
commit 9236c3b4e9
10 changed files with 55 additions and 62 deletions
--- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
+++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
@ -88,9 +88,6 @@ bool DumpJsonParser::IsDumpEnabled() {

  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
-  if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
-    MS_LOG(EXCEPTION) << "Dump is disabled in PyNative mode. Please set mode to GRAPH_MODE in context.";
-  }
  return true;
 }

--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
@ -555,12 +555,8 @@ void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
    return;
  }
  std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
-  std::string graph_str;
-  if (Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
-    graph_str = std::to_string(graph->graph_id());
-  } else {
-    graph_str = IsDeviceTargetGPU() ? std::to_string(graph->graph_id()) : std::to_string(graph->root_graph_id());
-  }
+  std::string graph_str =
+    IsDeviceTargetGPU() ? std::to_string(graph->graph_id()) : std::to_string(graph->root_graph_id());
  std::string file_name_to_check = execution_order_path + "/ms_global_execution_order_graph_" + graph_str + ".csv";
  auto real_path = Common::CreatePrefixPath(file_name_to_check);
  if (!real_path.has_value()) {
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@ -513,8 +513,13 @@ void Debugger::DumpParamsAndConstAndHistory() {
  for (auto kernel_graph = executed_graph_ptr_set_.cbegin(); kernel_graph != executed_graph_ptr_set_.cend();
       ++kernel_graph) {
    // Dump graph run history for each graph.
+    if (Debugger::GetInstance()->GetAscendKernelByKernelFlag() &&
+        (*kernel_graph)->graph_id() != (*kernel_graph)->root_graph_id()) {
+      MS_LOG(INFO) << "current graph graph_id = " << (*kernel_graph)->graph_id() << " is not root graph.";
+    } else {
      E2eDump::DumpRunIter(*kernel_graph, GetRankID());
    }
+  }
  if (!cur_root_graph_checked) {
    visited_root_graph_ids_.push_back(cur_root_graph_id_);
  }
--- a/mindspore/ccsrc/debug/debugger/debugger_utils.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger_utils.cc
@ -215,7 +215,7 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info,
    MS_EXCEPTION_IF_NULL(kernel_graph);
    auto graph_id = kernel_graph->graph_id();
    // for GPU, nodes are dumped in graph_id directory.
-    if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
+    if (IsDeviceTargetGPU()) {
      debugger->DumpSingleNode(cnode, graph_id);
    } else {
      // for Ascend, nodes are dumped in root_graph_id directory.
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.cc
@ -16,6 +16,7 @@

 #include "plugin/device/ascend/hal/device/dump/kernel_dumper.h"
 #include <algorithm>
+#include <utility>
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/dump_json_parser.h"
 #endif
@ -44,7 +45,7 @@ static constexpr uint64_t kOpDebugMemorySize = 2048;
 const size_t kDebugP2pSize = 8UL;
 }  // namespace
 DUMPER_REG(kAscendDevice, KernelDumper);
-std::mutex KernelDumper::debug_register_mutex_;
+std::mutex KernelDumper::dumper_mutex_;
 std::map<rtStream_t, std::unique_ptr<OpDebugTask>> KernelDumper::op_debug_tasks;
 std::map<uint32_t, bool> KernelDumper::is_data_map;
 std::map<std::string, std::string> KernelDumper::stream_task_graphs;
@ -80,9 +81,17 @@ KernelDumper::~KernelDumper() {
 }

 void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
-  std::lock_guard<std::mutex> lock(debug_register_mutex_);
-  aicpu::dump::OpMappingInfo dump_info;
-  SetOpMappingInfo(NOT_NULL(&dump_info), kernel);
+  auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
+  if (stream == nullptr) {
+    stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
+  }
+  if (DumpJsonParser::GetInstance().op_debug_mode() > 0) {
+    auto rt_ret = rtStreamSynchronize(stream);
+    dumper_mutex_.unlock();
+    if (rt_ret != ACL_ERROR_RT_AICORE_OVER_FLOW) {
+      return;
+    }
+  }

  if (!KernelNeedDump(kernel)) {
    return;
@ -91,10 +100,9 @@ void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
    MS_LOG(WARNING) << "[KernelDumper] kernel [" << kernel->UniqueName() << "] is a non-task node, skip dump.";
    return;
  }
-  auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
-  if (stream == nullptr) {
-    stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
-  }
+  aicpu::dump::OpMappingInfo dump_info;
+  SetOpMappingInfo(NOT_NULL(&dump_info), kernel);
+
  DumpJsonParser::GetInstance().MatchKernel(kernel->fullname_with_scope());
  aicpu::dump::Task task;
  ConstructDumpTask(NOT_NULL(kernel), NOT_NULL(&task));
@ -105,7 +113,7 @@ void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
  graph_id_ = AnfAlgo::GetGraphId(kernel.get());
  std::string stream_task_id = std::to_string(stream_id_) + std::to_string(task_id_);
  KernelDumper::stream_task_graphs.emplace(stream_task_id, kernel->fullname_with_scope());
-  MS_LOG(INFO) << "[DataDump] Get runtime info graph_id:" << graph_id_ << " stream_id:" << stream_id_
+  MS_LOG(INFO) << "[KernelDumper] Get runtime info graph_id:" << graph_id_ << " stream_id:" << stream_id_
               << " task_id:" << task_id_ << " fullname:" << kernel->fullname_with_scope();
 }

@ -114,12 +122,12 @@ void KernelDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_i
  dump_info->set_dump_path(dump_path_);
  dump_info->set_model_name(net_name_);
  dump_info->set_dump_step(iteration_);
-  auto graph_id = AnfAlgo::GetGraphId(kernel.get());
-  dump_info->set_model_id(graph_id);
-  dump_info->set_flag(kAicpuLoadFlag);
-
  FuncGraphPtr f_graph = kernel->func_graph();
  auto kernel_graph_ = f_graph->cast<KernelGraphPtr>();
+  auto root_graph_id = kernel_graph_->root_graph_id();
+  dump_info->set_model_id(root_graph_id);
+  dump_info->set_flag(kAicpuLoadFlag);
+
  auto input_ctrl_tensors = kernel_graph_->device_loop_control_tensors();
  if (input_ctrl_tensors.size() > 0) {
    auto kCurLoopCountName = "current_loop_count";
@ -225,7 +233,6 @@ void KernelDumper::ExecutorDumpOp(const aicpu::dump::OpMappingInfo &op_mapping_i
    MS_LOG(ERROR) << "[KernelDumper] Call rt api rtCpuKernelLaunch Failed, rt_ret = " << rt_ret;
    return;
  }
-  rtStreamSynchronize(stream_);
 }

 void KernelDumper::ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aicpu::dump::Task *> dump_task) {
@ -375,7 +382,6 @@ void KernelDumper::MallocP2PDebugMem(const void *const op_debug_addr) {
 }

 void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
-  std::lock_guard<std::mutex> lock(register_mutex_);
  uint32_t op_debug_mode = DumpJsonParser::GetInstance().op_debug_mode();
  auto iter = kOverflowModeStr.find(op_debug_mode);
  if (iter == kOverflowModeStr.end()) {
@ -384,6 +390,7 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
  if (op_debug_mode == kNoOverflow) {
    return;
  }
+  dumper_mutex_.lock();
  auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
  if (stream == nullptr) {
    stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
@ -391,6 +398,8 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
  if (KernelDumper::op_debug_tasks.find(stream) != KernelDumper::op_debug_tasks.end()) {
    return;
  } else {
+    std::string stream_id = std::to_string(AnfAlgo::GetStreamId(kernel));
+    KernelDumper::stream_task_graphs.emplace(stream_id, "KernelDumper");
    auto graph_id = AnfAlgo::GetGraphId(kernel.get());
    if (KernelDumper::is_data_map.find(graph_id) != KernelDumper::is_data_map.end()) {
      return;
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.h
@ -72,6 +72,7 @@ class KernelDumper : public debug::OverflowDumper {
  static std::map<rtStream_t, std::unique_ptr<OpDebugTask>> op_debug_tasks;
  static std::map<uint32_t, bool> is_data_map;
  static std::map<std::string, std::string> stream_task_graphs;
+  static std::mutex dumper_mutex_;

  string dump_path_;
  string net_name_;
@ -79,7 +80,6 @@ class KernelDumper : public debug::OverflowDumper {

 private:
  // Support multi-thread.
-  static std::mutex debug_register_mutex_;
  bool load_flag_;
  uint32_t graph_id_;
  uint32_t task_id_{0U};
@ -91,7 +91,6 @@ class KernelDumper : public debug::OverflowDumper {
  void *dev_load_mem_ = nullptr;
  void *proto_dev_mem_ = nullptr;
  void *proto_size_dev_mem_ = nullptr;
-  std::mutex register_mutex_;
  std::string overflow_dump_filename = "debug_files";
  void *p2p_debug_addr_ = nullptr;
  void SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_info, const CNodePtr &kernel);
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
@ -43,6 +43,7 @@
 #include "plugin/device/ascend/hal/profiler/ascend_profiling.h"
 #include "plugin/device/ascend/hal/device/profiling/profiling_manager.h"
 #include "plugin/device/ascend/hal/device/dump/ascend_dump.h"
+#include "debug/data_dump/overflow_dumper.h"

 using Adx::AdxRegDumpProcessCallBack;
 using mindspore::device::ascend::ProfilingManager;
@ -370,7 +371,13 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<Add
    stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
  }
  MS_EXCEPTION_IF_NULL(stream);
-
+#ifdef ENABLE_DEBUGGER
+  if (DumpJsonParser::GetInstance().async_dump_enabled()) {
+    auto register_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
+    register_dumper->Init();
+    register_dumper->OpDebugRegisterForStream(kernel);
+  }
+#endif
  bool is_dynamic_shape = common::AnfAlgo::IsDynamicShape(kernel);
  if (!is_dynamic_shape || !(common::AnfAlgo::GetBooleanAttr(kernel, kAttrMSFunction))) {
    auto iter = node_atomics_persistent_cache_.find(kernel);
@ -399,6 +406,12 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<Add
      return false;
    }
  }
+#ifdef ENABLE_DEBUGGER
+  if (DumpJsonParser::GetInstance().async_dump_enabled()) {
+    auto kernel_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
+    kernel_dumper->OpLoadDumpInfo(kernel);
+  }
+#endif
 #ifndef ENABLE_SECURITY
  auto ascend_instance = profiler::ascend::AscendProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(ascend_instance);
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc
@ -24,7 +24,6 @@
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/cpu_e2e_dump.h"
 #include "debug/data_dump/e2e_dump.h"
-#include "debug/data_dump/overflow_dumper.h"
 #include "utils/ms_context.h"
 #endif
 #ifdef ENABLE_DEBUGGER
@ -80,18 +79,9 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
 #endif
  } else if (device_context->GetDeviceType() == device::DeviceType::kAscend) {
 #ifdef ENABLE_DEBUGGER
-#ifndef ENABLE_SECURITY
-    auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
-    graph_id_sets_.insert(kernel_graph->graph_id());
-    if (DumpJsonParser::GetInstance().async_dump_enabled()) {
-      auto kernel_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
-      kernel_dumper->Init();
-      kernel_dumper->OpDebugRegisterForStream(cnode);
-      kernel_dumper->OpLoadDumpInfo(cnode);
-    }
-#endif
    auto debugger = Debugger::GetInstance();
    if (debugger != nullptr) {
+      auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
      debugger->InsertExecutedGraph(kernel_graph);
      debugger->SetAscendKernelByKernelFlag(true);
      bool read_data = CheckReadData(cnode);
@ -186,7 +176,7 @@ void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
        return kernel->fullname_with_scope().find("InitDataSetQueue") != std::string::npos;
      });
    }
-    if (!is_data_map_ && !graphs[0]->is_graph_run_mode()) {
+    if (!is_data_map_) {
      auto kCurLoopCountName = "current_loop_count";
      for (size_t i = 0; i < graphs.size(); i++) {
        const auto &graph_ = graphs[i];
@ -200,7 +190,7 @@ void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
        }
        auto tensor = device_loop_control_tensors.at(kCurLoopCountName);
        MS_EXCEPTION_IF_NULL(tensor);
-        auto *cur_val = static_cast<int32_t *>(tensor->data_c());
+        auto *cur_val = static_cast<int64_t *>(tensor->data_c());
        MS_EXCEPTION_IF_NULL(cur_val);
        *cur_val = current_step;
        tensor->set_sync_status(kNeedSyncHostToDevice);
@ -236,21 +226,6 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const
  }
 #endif

-#ifdef ENABLE_DEBUGGER
-#ifndef ENABLE_SECURITY
-  if (DumpJsonParser::GetInstance().async_dump_enabled() && DumpJsonParser::GetInstance().op_debug_mode() > 0 &&
-      Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
-    uint32_t rank_id = Debugger::GetRankID();
-    std::set<uint32_t>::iterator graph_id_iter;
-    for (graph_id_iter = graph_id_sets_.begin(); graph_id_iter != graph_id_sets_.end(); ++graph_id_iter) {
-      auto graph_id = *graph_id_iter;
-      DeleteNoOverflowFile(rank_id, graph_id);
-    }
-    graph_id_sets_.clear();
-  }
-#endif
-#endif
-
 #ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  if (debugger != nullptr) {
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.h
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.h
@ -55,7 +55,7 @@ class DebugActor : public ActorBase {

  // The debug on step end.
  void DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const AID *from_aid);
-  static inline uint32_t current_step{0};
+  static inline uint64_t current_step{0};

 private:
  // class members
@ -63,7 +63,6 @@ class DebugActor : public ActorBase {

  // Support multi-thread.
  std::mutex debug_mutex_;
-  std::set<uint32_t> graph_id_sets_;
 };

 }  // namespace runtime
--- a/tests/st/dump/test_multi_root_graph_dump.py
+++ b/tests/st/dump/test_multi_root_graph_dump.py
@ -101,8 +101,8 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
                time.sleep(2)
        execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order')
        # Multi root graph script: check dump data dir and graph history files and see if iteration number is matched.
-        if device == "GPU" or os.environ.get('GRAPH_OP_RUN') == "1":
-            # In GPU or KernelByKernel, we have 4 kernel graphs folders under rank_0 dir.
+        if device == "GPU":
+            # In GPU, we have 4 kernel graphs folders under rank_0 dir.
            # In graph history dir, there are 2 files for each graph (ms_execution_order and ms_global_execution_order).
            assert len(os.listdir(dump_file_path)) == 4
            assert len(os.listdir(execution_order_path)) == 8
@ -111,7 +111,7 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
            check_graph_structure(dump_file_path, execution_order_path, '2', ['1', '3'])
            check_graph_structure(dump_file_path, execution_order_path, '3', ['5'])
        else:
-            # In Ascend Super Kernel, we have 2 root graphs folders under rank_0 dir.
+            # In Ascend, we have 2 root graphs folders under rank_0 dir.
            # In graph history dir, there are 4 ms_execution_order files and 2 ms_global_execution_order files.
            # Each graph should have 3 iterations. Each graph was executed once per epoch.
            # Graph 0 was executed in even iterations, graph 1 was executed in odd iterations.