From 9a42d9223c7d16a3b89d779a0b6f2c0fabd9cde2 Mon Sep 17 00:00:00 2001
From: baihuawei <baihuawei@huawei.com>
Date: Mon, 14 Nov 2022 16:46:46 +0800
Subject: [PATCH] fix init hccl on serving scene and exit abort

---
 mindspore/ccsrc/cxx_api/graph/ascend/ascend_graph_impl.cc   | 5 +++++
 .../ccsrc/distributed/collective/collective_manager.cc      | 3 ++-
 .../ascend/hal/hardware/ascend_communication_group.cc       | 6 ++++--
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/mindspore/ccsrc/cxx_api/graph/ascend/ascend_graph_impl.cc b/mindspore/ccsrc/cxx_api/graph/ascend/ascend_graph_impl.cc
index 92c75942eae..9f213b45a03 100644
--- a/mindspore/ccsrc/cxx_api/graph/ascend/ascend_graph_impl.cc
+++ b/mindspore/ccsrc/cxx_api/graph/ascend/ascend_graph_impl.cc
@@ -28,6 +28,7 @@
 #include "runtime/dev.h"
 #include "include/common/utils/python_adapter.h"
 #include "runtime/hardware/device_context_manager.h"
+#include "distributed/init.h"
 
 namespace mindspore {
 API_GRAPH_REG(kAscendDevice, AscendGraphImpl);
@@ -39,6 +40,10 @@ void InitHccl() {
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
   mindspore::python_adapter::set_python_env_flag(true);
+  // Initialize HCCL via the distributed module; raise an exception if that fails.
+  if (!mindspore::distributed::Initialize()) {
+    MS_LOG(EXCEPTION) << "InitHccl failed.";
+  }
   uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
   if (ms_context->backend_policy() == "ms") {
     auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
diff --git a/mindspore/ccsrc/distributed/collective/collective_manager.cc b/mindspore/ccsrc/distributed/collective/collective_manager.cc
index cd62f6a593d..94e4baf3148 100644
--- a/mindspore/ccsrc/distributed/collective/collective_manager.cc
+++ b/mindspore/ccsrc/distributed/collective/collective_manager.cc
@@ -270,7 +270,8 @@ bool CollectiveManager::CreateCommunicationGroup(const std::string &group_name,
   // Timeout limit 600 seconds to wait finish initializing device communication group.
   const int64_t kTimeToWait = 600;
   // Initialize communication group on the device side in thread with timeout limit.
-  ret = ExecuteFuncInThread(init_device_comm_group_func, kTimeToWait);
+  MS_EXCEPTION_IF_CHECK_FAIL(ExecuteFuncInThread(init_device_comm_group_func, kTimeToWait),
+                             "Create group " + group_name + " failed.");
   MS_LOG(INFO) << "End initialize communication group on the device side.";
   return ret;
 }
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_communication_group.cc b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_communication_group.cc
index 9070e7095b4..d2bac3e44b1 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_communication_group.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_communication_group.cc
@@ -34,8 +34,10 @@ bool AscendCommunicationGroup::Initialize(void *root_info) {
 
   unique_id_ = *(static_cast<HcclRootInfo *>(root_info));
   uint32_t group_rank = GetGroupRank(global_rank_);
-  CHECK_RET(HcclCommInitRootInfo(static_cast<uint32_t>(size_), &unique_id_, static_cast<uint32_t>(group_rank), &comm_),
-            static_cast<int32_t>(HCCL_SUCCESS), "Initializing HCCL communicator failed.");
+  RETURN_IF_FALSE_WITH_LOG(
+    HcclCommInitRootInfo(static_cast<uint32_t>(size_), &unique_id_, static_cast<uint32_t>(group_rank), &comm_) ==
+      static_cast<int32_t>(HCCL_SUCCESS),
+    "Initializing HCCL communicator failed.");
   initialized_ = true;
   return true;
 }