!8969 [bug_fix] GPU distributed training core dumps when memory is not enough

From: @zyli2020 Reviewed-by: @limingqi107,@cristoval Signed-off-by: @cristoval
2020-11-25 11:18:25 +08:00 · 2020-11-25 11:18:25 +08:00 · ddff3c4277
parent 2fb5ab631d 6f6a0dfd7a
commit ddff3c4277
3 changed files with 25 additions and 17 deletions
--- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc
+++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc
@ -346,7 +346,6 @@ bool MemSwapManager::RetreatSwapInfo() {
    ResetSwapInfo();
    RetreatSwapThreshold();
    if (tensor_size_threshold_idx_ == ordered_tensors_.size() - 1 && distance_threshold_ < kDistanceLowerBound) {
-      MS_LOG(ERROR) << "Retreat swap info failed";
      return false;
    }
  } else {
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@ -310,6 +310,13 @@ void GPUKernelRuntime::AllocInplaceNodeMemory(const session::KernelGraph *graph)
  }
 }

+bool GPUKernelRuntime::IsDistributedTraining(const session::KernelGraph *graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+  const auto &kernels = graph->execution_order();
+  return std::any_of(kernels.begin(), kernels.end(),
+                     [](const AnfNodePtr &kernel) { return AnfAlgo::IsCommunicationOp(kernel); });
+}
+
 void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
@ -367,28 +374,28 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) {
 }

 bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) {
-  bool ret = true;
  auto graph_id = graph->graph_id();
  if (!is_first_step_map_[graph_id] || graph->is_dynamic_shape()) {
    // Normally run graph
-    ret = LaunchKernelDynamic(graph);
-  } else {
-    // Mock run first step
-    ret = LaunchKernelDynamic(graph, true, false);
-    if (ret) {
-      // Normally run graph
-      ret = LaunchKernelDynamic(graph);
-    } else {
-      // Trigger memory swap
-      ret = SearchMemSwapScheme(graph);
-    }
-    is_first_step_map_[graph_id] = false;
+    return LaunchKernelDynamic(graph);
  }
-  return ret;
+  is_first_step_map_[graph_id] = false;
+  // Mock run first step
+  bool ret = LaunchKernelDynamic(graph, true, false);
+  if (ret) {
+    // Normally run graph
+    return LaunchKernelDynamic(graph);
+  }
+  if (IsDistributedTraining(graph)) {
+    MS_LOG(ERROR) << "Device memory is not enough, run graph failed!";
+    return false;
+  }
+  // Trigger memory swap
+  return SearchMemSwapScheme(graph);
 }

 bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
-  MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
+  MS_LOG(INFO) << "Run out of memory and try memory swapping, it may take some time, please wait a moment.";
  bool ret = false;
  ClearKernelOldOutputAndWorkspace(graph);
  if (!mem_swap_manager_->mem_swap_init()) {
@ -399,6 +406,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {

  while (!ret) {
    if (!mem_swap_manager_->RetreatSwapInfo()) {
+      MS_LOG(ERROR) << "Device memory is not enough, run graph failed!";
      return false;
    }
    ret = LaunchKernelDynamic(graph, true, false);
@ -417,7 +425,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) {
 }

 bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) {
-  MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment.";
+  MS_LOG(INFO) << "Refine memory swap scheme, it may take some time, please wait a moment.";
  auto &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) {
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
@ -98,6 +98,7 @@ class GPUKernelRuntime : public KernelRuntime {
  void UpdateHostSwapOutQueue(bool mock);
  void ClearSwapInfo(bool mock);
  void AllocInplaceNodeMemory(const session::KernelGraph *graph);
+  bool IsDistributedTraining(const session::KernelGraph *graph);

  DeviceAddressPtr GetPrevNodeMutableOutputAddr(const AnfNodePtr &node, size_t i, bool visit_nop_node);
  DeviceAddressPtr GetMutableOutputAddr(const AnfNodePtr &node, size_t i, bool visit_nop_node);