From 9a42d9223c7d16a3b89d779a0b6f2c0fabd9cde2 Mon Sep 17 00:00:00 2001 From: baihuawei <baihuawei@huawei.com> Date: Mon, 14 Nov 2022 16:46:46 +0800 Subject: [PATCH] fix init hccl on serving scene and exit abort --- mindspore/ccsrc/cxx_api/graph/ascend/ascend_graph_impl.cc | 5 +++++ .../ccsrc/distributed/collective/collective_manager.cc | 3 ++- .../ascend/hal/hardware/ascend_communication_group.cc | 6 ++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/mindspore/ccsrc/cxx_api/graph/ascend/ascend_graph_impl.cc b/mindspore/ccsrc/cxx_api/graph/ascend/ascend_graph_impl.cc index 92c75942eae..9f213b45a03 100644 --- a/mindspore/ccsrc/cxx_api/graph/ascend/ascend_graph_impl.cc +++ b/mindspore/ccsrc/cxx_api/graph/ascend/ascend_graph_impl.cc @@ -28,6 +28,7 @@ #include "runtime/dev.h" #include "include/common/utils/python_adapter.h" #include "runtime/hardware/device_context_manager.h" +#include "distributed/init.h" namespace mindspore { API_GRAPH_REG(kAscendDevice, AscendGraphImpl); @@ -39,6 +40,10 @@ void InitHccl() { auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); mindspore::python_adapter::set_python_env_flag(true); + // init hccl from distributed + if (!mindspore::distributed::Initialize()) { + MS_LOG(EXCEPTION) << "InitHccl failed."; + } uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID); if (ms_context->backend_policy() == "ms") { auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id); diff --git a/mindspore/ccsrc/distributed/collective/collective_manager.cc b/mindspore/ccsrc/distributed/collective/collective_manager.cc index cd62f6a593d..94e4baf3148 100644 --- a/mindspore/ccsrc/distributed/collective/collective_manager.cc +++ b/mindspore/ccsrc/distributed/collective/collective_manager.cc @@ -270,7 +270,8 @@ bool CollectiveManager::CreateCommunicationGroup(const std::string &group_name, // Timeout limit 600 seconds to wait finish initializing device communication group. const int64_t kTimeToWait = 600; // Initialize communication group on the device side in thread with timeout limit. - ret = ExecuteFuncInThread(init_device_comm_group_func, kTimeToWait); + MS_EXCEPTION_IF_CHECK_FAIL(ExecuteFuncInThread(init_device_comm_group_func, kTimeToWait), + "Create group" + group_name + "failed."); MS_LOG(INFO) << "End initialize communication group on the device side."; return ret; } diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_communication_group.cc b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_communication_group.cc index 9070e7095b4..d2bac3e44b1 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_communication_group.cc +++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_communication_group.cc @@ -34,8 +34,10 @@ bool AscendCommunicationGroup::Initialize(void *root_info) { unique_id_ = *(static_cast<HcclRootInfo *>(root_info)); uint32_t group_rank = GetGroupRank(global_rank_); - CHECK_RET(HcclCommInitRootInfo(static_cast<uint32_t>(size_), &unique_id_, static_cast<uint32_t>(group_rank), &comm_), - static_cast<int32_t>(HCCL_SUCCESS), "Initializing HCCL communicator failed."); + RETURN_IF_FALSE_WITH_LOG( + HcclCommInitRootInfo(static_cast<uint32_t>(size_), &unique_id_, static_cast<uint32_t>(group_rank), &comm_) == + static_cast<int32_t>(HCCL_SUCCESS), + "Initializing HCCL communicator failed."); initialized_ = true; return true; }