!20544 Fix the infinite loop for multigraph on GPU

Merge pull request !20544 from maning202007/master
This commit is contained in:
i-robot 2021-07-20 11:34:40 +00:00 committed by Gitee
commit 2c36f092e3
4 changed files with 29 additions and 32 deletions

View File

@ -451,7 +451,7 @@ GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
// build kernel
BuildKernel(root_graph);
if (debugger_ && debugger_->partial_memory()) {
debugger_->PreExecute(root_graph, graph_sum_);
debugger_->PreExecute(root_graph);
}
SetSummaryNodes(root_graph.get());
// Alloc memory for child graph's inputs
@ -540,7 +540,7 @@ void AscendSession::BuildGraphImpl(GraphId graph_id) {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (debugger_ && debugger_->partial_memory()) {
debugger_->PreExecute(graph, graph_sum_);
debugger_->PreExecute(graph);
}
if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) {
MS_LOG(INFO) << "Precompile only, stop in build kernel step";
@ -588,7 +588,7 @@ bool AscendSession::IsSupportSummary() { return !device::KernelAdjust::NeedInser
void AscendSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *const) {
if (debugger_) {
debugger_->PreExecute(kernel_graph, graph_sum_);
debugger_->PreExecute(kernel_graph);
}
#if ENABLE_CPU && ENABLE_D
// Initialize parameter server

View File

@ -477,7 +477,7 @@ GraphId GPUSession::CompileGraphImpl(KernelGraphPtr graph) {
void GPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
if (debugger_) {
debugger_->PreExecute(kernel_graph, graph_sum_);
debugger_->PreExecute(kernel_graph);
}
DumpSetup(kernel_graph);

View File

@ -271,16 +271,15 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
if (device_target_ != kGPUDevice) {
return;
}
uint32_t graph_sum = graphs.size();
for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
const auto &graph = graphs[graph_index];
if (debugger_) {
debugger_->PreExecute(graph, graph_sum);
debugger_->PreExecute(graph);
}
DumpSetup(graph);
}
}
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
CheckDatasetSinkMode();
@ -294,10 +293,8 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
rungraph_id_list_.push_back(graph_id);
}
}
// check and save graph_ptr, suspend if graph is new
MS_LOG(INFO) << "total number graph: " << graph_sum;
// multiple graphs
if (graph_sum > 1) {
if (graph_proto_list_.size() > 1) {
// there are more than one graphs are not dataset_graph
if (not_dataset_graph_sum_ > 0) {
// only try to enable debugger if they are not all dataset graphs
@ -305,32 +302,21 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
EnableDebugger();
}
if (debugger_enabled_) {
if (graph_proto_list_.size()) {
// only send compiled graphs once.
auto dbg_graph_ptr = graph_ptr_;
// use current graph ptr to load parameters
graph_ptr_ = graph_ptr;
LoadParametersAndConst();
// revert graph ptr to original value
graph_ptr_ = dbg_graph_ptr;
// only send compiled graphs once at the initial step.
auto dbg_graph_ptr = graph_ptr_;
// use current graph ptr to load parameters
graph_ptr_ = graph_ptr;
LoadParametersAndConst();
// revert graph ptr to original value
graph_ptr_ = dbg_graph_ptr;
SendMultiGraphsAndSuspend(graph_proto_list_);
SendMultiGraphsAndSuspend(graph_proto_list_);
graph_proto_list_.clear();
} else if (graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice) {
// stop only when receive the first sub run graph for each step
// if we have stopped for the last kernel before, no need to stop again
if (pipeline::ExecutorPy::GetDebugTerminate()) {
return;
}
if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
CommandLoop();
}
debug_services_->ResetLoadedTensors();
}
graph_proto_list_.clear();
}
}
} else if (graph_proto_list_.size() == 1) {
// single graph, and not the initial step
if (device_target_ == kGPUDevice && num_step_ != 0) {
if (debugger_enabled_ && !(run_level_ == "node" && suspended_at_last_kernel_)) {
CommandLoop();
@ -342,6 +328,17 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum) {
graph_ptr_ = nullptr;
CheckGraphPtr(graph_ptr);
}
} else if (graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice) {
// Multiple graph, and not the initial step,
// stop only when receive the first sub run graph for each step
// if we have stopped for the last kernel before, no need to stop again
if (pipeline::ExecutorPy::GetDebugTerminate()) {
return;
}
if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
CommandLoop();
}
debug_services_->ResetLoadedTensors();
}
// resets for the new graph
suspended_at_last_kernel_ = 0;

View File

@ -77,7 +77,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// enable debugger
// send graph and wait for command
// do nothing if graph is set already
void PreExecute(const KernelGraphPtr &graph_ptr, uint32_t graph_sum = 1);
void PreExecute(const KernelGraphPtr &graph_ptr);
// analyze tensors and wait for command
// don't need a graph_ptr because it is saved during pre_execute