!45488 fix init hccl on serving and exit abort bug

Merge pull request !45488 from baihuawei/fix_dynamic_cluster_on_serving_and_exit_hanging
This commit is contained in:
i-robot 2022-11-15 12:35:21 +00:00 committed by Gitee
commit de31b33648
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
3 changed files with 11 additions and 3 deletions

View File

@@ -28,6 +28,7 @@
#include "runtime/dev.h"
#include "include/common/utils/python_adapter.h"
#include "runtime/hardware/device_context_manager.h"
+#include "distributed/init.h"
namespace mindspore {
API_GRAPH_REG(kAscendDevice, AscendGraphImpl);
@@ -39,6 +40,10 @@ void InitHccl() {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
mindspore::python_adapter::set_python_env_flag(true);
+// init hccl from distributed
+if (!mindspore::distributed::Initialize()) {
+MS_LOG(EXCEPTION) << "InitHccl failed.";
+}
uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
if (ms_context->backend_policy() == "ms") {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);

View File

@@ -270,7 +270,8 @@ bool CollectiveManager::CreateCommunicationGroup(const std::string &group_name,
// Timeout limit 600 seconds to wait finish initializing device communication group.
const int64_t kTimeToWait = 600;
// Initialize communication group on the device side in thread with timeout limit.
-ret = ExecuteFuncInThread(init_device_comm_group_func, kTimeToWait);
+MS_EXCEPTION_IF_CHECK_FAIL(ExecuteFuncInThread(init_device_comm_group_func, kTimeToWait),
+"Create group" + group_name + "failed.");
MS_LOG(INFO) << "End initialize communication group on the device side.";
return ret;
}

View File

@@ -34,8 +34,10 @@ bool AscendCommunicationGroup::Initialize(void *root_info) {
unique_id_ = *(static_cast<HcclRootInfo *>(root_info));
uint32_t group_rank = GetGroupRank(global_rank_);
-CHECK_RET(HcclCommInitRootInfo(static_cast<uint32_t>(size_), &unique_id_, static_cast<uint32_t>(group_rank), &comm_),
-static_cast<int32_t>(HCCL_SUCCESS), "Initializing HCCL communicator failed.");
+RETURN_IF_FALSE_WITH_LOG(
+HcclCommInitRootInfo(static_cast<uint32_t>(size_), &unique_id_, static_cast<uint32_t>(group_rank), &comm_) ==
+static_cast<int32_t>(HCCL_SUCCESS),
+"Initializing HCCL communicator failed.");
initialized_ = true;
return true;
}