From 67bfdcab093c27445b0171e63cb891ae37677e0b Mon Sep 17 00:00:00 2001
From: maoyaomin <maoyaomin4@huawei.com>
Date: Sat, 4 Mar 2023 17:46:28 +0800
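Subject: [PATCH] Add PyNative mode operator overflow check for dump

Allow dump in PyNative mode by removing the exception raised from
DumpJsonParser::IsDumpEnabled, and move the async-dump overflow
registration (OpDebugRegisterForStream) and dump-info loading
(OpLoadDumpInfo) from DebugActor::Debug into
AscendKernelExecutor::LaunchKernel.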
---
 .../ccsrc/debug/data_dump/dump_json_parser.cc |  3 --
 mindspore/ccsrc/debug/data_dump/e2e_dump.cc   |  8 +---
 mindspore/ccsrc/debug/debugger/debugger.cc    |  7 +++-
 .../ccsrc/debug/debugger/debugger_utils.cc    |  2 +-
 .../ascend/hal/device/dump/kernel_dumper.cc   | 39 ++++++++++++-------
 .../ascend/hal/device/dump/kernel_dumper.h    |  3 +-
 .../hal/hardware/ascend_kernel_executor.cc    | 15 ++++++-
 .../graph_scheduler/actor/debug_actor.cc      | 31 ++-------------
 .../graph_scheduler/actor/debug_actor.h       |  3 +-
 tests/st/dump/test_multi_root_graph_dump.py   |  6 +--
 10 files changed, 55 insertions(+), 62 deletions(-)
diff --git a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
index b67de7d2447..99c5388a6b4 100644
--- a/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
+++ b/mindspore/ccsrc/debug/data_dump/dump_json_parser.cc
@@ -88,9 +88,6 @@ bool DumpJsonParser::IsDumpEnabled() {
 
   auto context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context);
-  if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
-    MS_LOG(EXCEPTION) << "Dump is disabled in PyNative mode. Please set mode to GRAPH_MODE in context.";
-  }
   return true;
 }
 
diff --git a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
index 8dcb6af2701..050bdea5f23 100644
--- a/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
+++ b/mindspore/ccsrc/debug/data_dump/e2e_dump.cc
@@ -555,12 +555,8 @@ void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
     return;
   }
   std::string execution_order_path = json_parser.path() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
-  std::string graph_str;
-  if (Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
-    graph_str = std::to_string(graph->graph_id());
-  } else {
-    graph_str = IsDeviceTargetGPU() ? std::to_string(graph->graph_id()) : std::to_string(graph->root_graph_id());
-  }
+  std::string graph_str =
+    IsDeviceTargetGPU() ? std::to_string(graph->graph_id()) : std::to_string(graph->root_graph_id());
   std::string file_name_to_check = execution_order_path + "/ms_global_execution_order_graph_" + graph_str + ".csv";
   auto real_path = Common::CreatePrefixPath(file_name_to_check);
   if (!real_path.has_value()) {
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index 8e920826e43..df91acf1ac0 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -513,7 +513,12 @@ void Debugger::DumpParamsAndConstAndHistory() {
   for (auto kernel_graph = executed_graph_ptr_set_.cbegin(); kernel_graph != executed_graph_ptr_set_.cend();
        ++kernel_graph) {
     // Dump graph run hisotry for each graph.
-    E2eDump::DumpRunIter(*kernel_graph, GetRankID());
+    if (Debugger::GetInstance()->GetAscendKernelByKernelFlag() &&
+        (*kernel_graph)->graph_id() != (*kernel_graph)->root_graph_id()) {
+      MS_LOG(INFO) << "current graph graph_id = " << (*kernel_graph)->graph_id() << " is not root graph.";
+    } else {
+      E2eDump::DumpRunIter(*kernel_graph, GetRankID());
+    }
   }
   if (!cur_root_graph_checked) {
     visited_root_graph_ids_.push_back(cur_root_graph_id_);
diff --git a/mindspore/ccsrc/debug/debugger/debugger_utils.cc b/mindspore/ccsrc/debug/debugger/debugger_utils.cc
index d42923d68bc..8273dff40a4 100644
--- a/mindspore/ccsrc/debug/debugger/debugger_utils.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger_utils.cc
@@ -215,7 +215,7 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info,
     MS_EXCEPTION_IF_NULL(kernel_graph);
     auto graph_id = kernel_graph->graph_id();
     // for GPU, nodes are dumped in graph_id directory.
-    if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
+    if (IsDeviceTargetGPU()) {
       debugger->DumpSingleNode(cnode, graph_id);
     } else {
       // for Ascend, node are dumped in root_graph_id directory.
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.cc
index 84fd30e913f..e263e0e5215 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.cc
@@ -16,6 +16,7 @@
 
 #include "plugin/device/ascend/hal/device/dump/kernel_dumper.h"
 #include <algorithm>
+#include <utility>
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/dump_json_parser.h"
 #endif
@@ -44,7 +45,7 @@ static constexpr uint64_t kOpDebugMemorySize = 2048;
 const size_t kDebugP2pSize = 8UL;
 }  // namespace
 DUMPER_REG(kAscendDevice, KernelDumper);
-std::mutex KernelDumper::debug_register_mutex_;
+std::mutex KernelDumper::dumper_mutex_;
 std::map<rtStream_t, std::unique_ptr<OpDebugTask>> KernelDumper::op_debug_tasks;
 std::map<uint32_t, bool> KernelDumper::is_data_map;
 std::map<std::string, std::string> KernelDumper::stream_task_graphs;
@@ -80,9 +81,17 @@ KernelDumper::~KernelDumper() {
 }
 
 void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
-  std::lock_guard<std::mutex> lock(debug_register_mutex_);
-  aicpu::dump::OpMappingInfo dump_info;
-  SetOpMappingInfo(NOT_NULL(&dump_info), kernel);
+  auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
+  if (stream == nullptr) {
+    stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
+  }
+  if (DumpJsonParser::GetInstance().op_debug_mode() > 0) {
+    auto rt_ret = rtStreamSynchronize(stream);
+    dumper_mutex_.unlock();
+    if (rt_ret != ACL_ERROR_RT_AICORE_OVER_FLOW) {
+      return;
+    }
+  }
 
   if (!KernelNeedDump(kernel)) {
     return;
@@ -91,10 +100,9 @@ void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
     MS_LOG(WARNING) << "[KernelDumper] kernel [" << kernel->UniqueName() << "] is a non-task node, skip dump.";
     return;
   }
-  auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
-  if (stream == nullptr) {
-    stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
-  }
+  aicpu::dump::OpMappingInfo dump_info;
+  SetOpMappingInfo(NOT_NULL(&dump_info), kernel);
+
   DumpJsonParser::GetInstance().MatchKernel(kernel->fullname_with_scope());
   aicpu::dump::Task task;
   ConstructDumpTask(NOT_NULL(kernel), NOT_NULL(&task));
@@ -105,7 +113,7 @@ void KernelDumper::OpLoadDumpInfo(const CNodePtr &kernel) {
   graph_id_ = AnfAlgo::GetGraphId(kernel.get());
   std::string stream_task_id = std::to_string(stream_id_) + std::to_string(task_id_);
   KernelDumper::stream_task_graphs.emplace(stream_task_id, kernel->fullname_with_scope());
-  MS_LOG(INFO) << "[DataDump] Get runtime info graph_id:" << graph_id_ << " stream_id:" << stream_id_
+  MS_LOG(INFO) << "[KernelDumper] Get runtime info graph_id:" << graph_id_ << " stream_id:" << stream_id_
                << " task_id:" << task_id_ << " fullname:" << kernel->fullname_with_scope();
 }
 
@@ -114,12 +122,12 @@ void KernelDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_i
   dump_info->set_dump_path(dump_path_);
   dump_info->set_model_name(net_name_);
   dump_info->set_dump_step(iteration_);
-  auto graph_id = AnfAlgo::GetGraphId(kernel.get());
-  dump_info->set_model_id(graph_id);
-  dump_info->set_flag(kAicpuLoadFlag);
-
   FuncGraphPtr f_graph = kernel->func_graph();
   auto kernel_graph_ = f_graph->cast<KernelGraphPtr>();
+  auto root_graph_id = kernel_graph_->root_graph_id();
+  dump_info->set_model_id(root_graph_id);
+  dump_info->set_flag(kAicpuLoadFlag);
+
   auto input_ctrl_tensors = kernel_graph_->device_loop_control_tensors();
   if (input_ctrl_tensors.size() > 0) {
     auto kCurLoopCountName = "current_loop_count";
@@ -225,7 +233,6 @@ void KernelDumper::ExecutorDumpOp(const aicpu::dump::OpMappingInfo &op_mapping_i
     MS_LOG(ERROR) << "[KernelDumper] Call rt api rtCpuKernelLaunch Failed, rt_ret = " << rt_ret;
     return;
   }
-  rtStreamSynchronize(stream_);
 }
 
 void KernelDumper::ConstructDumpTask(NotNull<const CNodePtr &> kernel, NotNull<aicpu::dump::Task *> dump_task) {
@@ -375,7 +382,6 @@ void KernelDumper::MallocP2PDebugMem(const void *const op_debug_addr) {
 }
 
 void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
-  std::lock_guard<std::mutex> lock(register_mutex_);
   uint32_t op_debug_mode = DumpJsonParser::GetInstance().op_debug_mode();
   auto iter = kOverflowModeStr.find(op_debug_mode);
   if (iter == kOverflowModeStr.end()) {
@@ -384,6 +390,7 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
   if (op_debug_mode == kNoOverflow) {
     return;
   }
+  dumper_mutex_.lock();
   auto stream = AscendStreamMng::GetInstance().GetStream(AnfAlgo::GetStreamId(kernel));
   if (stream == nullptr) {
     stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
@@ -391,6 +398,8 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
   if (KernelDumper::op_debug_tasks.find(stream) != KernelDumper::op_debug_tasks.end()) {
     return;
   } else {
+    std::string stream_id = std::to_string(AnfAlgo::GetStreamId(kernel));
+    KernelDumper::stream_task_graphs.emplace(stream_id, "KernelDumper");
     auto graph_id = AnfAlgo::GetGraphId(kernel.get());
     if (KernelDumper::is_data_map.find(graph_id) != KernelDumper::is_data_map.end()) {
       return;
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.h b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.h
index db504607c03..a64550951b0 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/dump/kernel_dumper.h
@@ -72,6 +72,7 @@ class KernelDumper : public debug::OverflowDumper {
   static std::map<rtStream_t, std::unique_ptr<OpDebugTask>> op_debug_tasks;
   static std::map<uint32_t, bool> is_data_map;
   static std::map<std::string, std::string> stream_task_graphs;
+  static std::mutex dumper_mutex_;
 
   string dump_path_;
   string net_name_;
@@ -79,7 +80,6 @@ class KernelDumper : public debug::OverflowDumper {
 
  private:
   // Support multi-thread.
-  static std::mutex debug_register_mutex_;
   bool load_flag_;
   uint32_t graph_id_;
   uint32_t task_id_{0U};
@@ -91,7 +91,6 @@ class KernelDumper : public debug::OverflowDumper {
   void *dev_load_mem_ = nullptr;
   void *proto_dev_mem_ = nullptr;
   void *proto_size_dev_mem_ = nullptr;
-  std::mutex register_mutex_;
   std::string overflow_dump_filename = "debug_files";
   void *p2p_debug_addr_ = nullptr;
   void SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_info, const CNodePtr &kernel);
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
index 7954081c282..b8c3778d166 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
@@ -43,6 +43,7 @@
 #include "plugin/device/ascend/hal/profiler/ascend_profiling.h"
 #include "plugin/device/ascend/hal/device/profiling/profiling_manager.h"
 #include "plugin/device/ascend/hal/device/dump/ascend_dump.h"
+#include "debug/data_dump/overflow_dumper.h"
 
 using Adx::AdxRegDumpProcessCallBack;
 using mindspore::device::ascend::ProfilingManager;
@@ -370,7 +371,13 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<Add
     stream = AscendStreamMng::GetInstance().GetStream(kDefaultStreamIndex);
   }
   MS_EXCEPTION_IF_NULL(stream);
-
+#ifdef ENABLE_DEBUGGER
+  if (DumpJsonParser::GetInstance().async_dump_enabled()) {
+    auto register_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
+    register_dumper->Init();
+    register_dumper->OpDebugRegisterForStream(kernel);
+  }
+#endif
   bool is_dynamic_shape = common::AnfAlgo::IsDynamicShape(kernel);
   if (!is_dynamic_shape || !(common::AnfAlgo::GetBooleanAttr(kernel, kAttrMSFunction))) {
     auto iter = node_atomics_persistent_cache_.find(kernel);
@@ -399,6 +406,12 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<Add
       return false;
     }
   }
+#ifdef ENABLE_DEBUGGER
+  if (DumpJsonParser::GetInstance().async_dump_enabled()) {
+    auto kernel_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
+    kernel_dumper->OpLoadDumpInfo(kernel);
+  }
+#endif
 #ifndef ENABLE_SECURITY
   auto ascend_instance = profiler::ascend::AscendProfiler::GetInstance();
   MS_EXCEPTION_IF_NULL(ascend_instance);
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc
index 54f29f81ed2..28bb55d3f28 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.cc
@@ -24,7 +24,6 @@
 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/cpu_e2e_dump.h"
 #include "debug/data_dump/e2e_dump.h"
-#include "debug/data_dump/overflow_dumper.h"
 #include "utils/ms_context.h"
 #endif
 #ifdef ENABLE_DEBUGGER
@@ -80,18 +79,9 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
 #endif
   } else if (device_context->GetDeviceType() == device::DeviceType::kAscend) {
 #ifdef ENABLE_DEBUGGER
-#ifndef ENABLE_SECURITY
-    auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
-    graph_id_sets_.insert(kernel_graph->graph_id());
-    if (DumpJsonParser::GetInstance().async_dump_enabled()) {
-      auto kernel_dumper = debug::OverflowDumper::GetInstance(kAscendDevice);
-      kernel_dumper->Init();
-      kernel_dumper->OpDebugRegisterForStream(cnode);
-      kernel_dumper->OpLoadDumpInfo(cnode);
-    }
-#endif
     auto debugger = Debugger::GetInstance();
     if (debugger != nullptr) {
+      auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
       debugger->InsertExecutedGraph(kernel_graph);
       debugger->SetAscendKernelByKernelFlag(true);
       bool read_data = CheckReadData(cnode);
@@ -186,7 +176,7 @@ void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
         return kernel->fullname_with_scope().find("InitDataSetQueue") != std::string::npos;
       });
     }
-    if (!is_data_map_ && !graphs[0]->is_graph_run_mode()) {
+    if (!is_data_map_) {
       auto kCurLoopCountName = "current_loop_count";
       for (size_t i = 0; i < graphs.size(); i++) {
         const auto &graph_ = graphs[i];
@@ -200,7 +190,7 @@ void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
         }
         auto tensor = device_loop_control_tensors.at(kCurLoopCountName);
         MS_EXCEPTION_IF_NULL(tensor);
-        auto *cur_val = static_cast<int32_t *>(tensor->data_c());
+        auto *cur_val = static_cast<int64_t *>(tensor->data_c());
         MS_EXCEPTION_IF_NULL(cur_val);
         *cur_val = current_step;
         tensor->set_sync_status(kNeedSyncHostToDevice);
@@ -236,21 +226,6 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const
   }
 #endif
 
-#ifdef ENABLE_DEBUGGER
-#ifndef ENABLE_SECURITY
-  if (DumpJsonParser::GetInstance().async_dump_enabled() && DumpJsonParser::GetInstance().op_debug_mode() > 0 &&
-      Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
-    uint32_t rank_id = Debugger::GetRankID();
-    std::set<uint32_t>::iterator graph_id_iter;
-    for (graph_id_iter = graph_id_sets_.begin(); graph_id_iter != graph_id_sets_.end(); ++graph_id_iter) {
-      auto graph_id = *graph_id_iter;
-      DeleteNoOverflowFile(rank_id, graph_id);
-    }
-    graph_id_sets_.clear();
-  }
-#endif
-#endif
-
 #ifdef ENABLE_DEBUGGER
   auto debugger = Debugger::GetInstance();
   if (debugger != nullptr) {
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.h b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.h
index 71fa4e09652..c9ed8836bbb 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.h
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/debug_actor.h
@@ -55,7 +55,7 @@ class DebugActor : public ActorBase {
 
   // The debug on step end.
   void DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const AID *from_aid);
-  static inline uint32_t current_step{0};
+  static inline uint64_t current_step{0};
 
  private:
   // class members
@@ -63,7 +63,6 @@ class DebugActor : public ActorBase {
 
   // Support multi-thread.
   std::mutex debug_mutex_;
-  std::set<uint32_t> graph_id_sets_;
 };
 
 }  // namespace runtime
diff --git a/tests/st/dump/test_multi_root_graph_dump.py b/tests/st/dump/test_multi_root_graph_dump.py
index 81e77de6f60..805c627ce6a 100644
--- a/tests/st/dump/test_multi_root_graph_dump.py
+++ b/tests/st/dump/test_multi_root_graph_dump.py
@@ -101,8 +101,8 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
                 time.sleep(2)
         execution_order_path = os.path.join(dump_path, 'rank_0', 'execution_order')
         # Multi root graph script: check dump data dir and graph history files and see if iteration number is matched.
-        if device == "GPU" or os.environ.get('GRAPH_OP_RUN') == "1":
-            # In GPU or KernelByKernel, we have 4 kernel graphs folders under rank_0 dir.
+        if device == "GPU":
+            # On GPU, there are 4 kernel graph folders under the rank_0 dir.
             # In graph history dir, there are 2 files for each graph (ms_execution_order and ms_global_execution_order).
             assert len(os.listdir(dump_file_path)) == 4
             assert len(os.listdir(execution_order_path)) == 8
@@ -111,7 +111,7 @@ def run_multi_root_graph_dump(device, dump_mode, test_name):
             check_graph_structure(dump_file_path, execution_order_path, '2', ['1', '3'])
             check_graph_structure(dump_file_path, execution_order_path, '3', ['5'])
         else:
-            # In Ascend Super Kernel, we have 2 root graphs folders under rank_0 dir.
+            # On Ascend, there are 2 root graph folders under the rank_0 dir.
             # In graph history dir, there are 4 ms_execution_order files and 2 ms_global_execution_order files.
             # Each graph should have 3 iterations. Each graph was executed once per epoch.
             # Graph 0 was executed in even iterations, graph 1 was executed in odd iterations.