!45488 fix init hccl on serving and exit abort bug
Merge pull request !45488 from baihuawei/fix_dynamic_cluster_on_serving_and_exit_hanging
This commit is contained in:
commit
de31b33648
|
@ -28,6 +28,7 @@
|
|||
#include "runtime/dev.h"
|
||||
#include "include/common/utils/python_adapter.h"
|
||||
#include "runtime/hardware/device_context_manager.h"
|
||||
#include "distributed/init.h"
|
||||
|
||||
namespace mindspore {
|
||||
API_GRAPH_REG(kAscendDevice, AscendGraphImpl);
|
||||
|
@ -39,6 +40,10 @@ void InitHccl() {
|
|||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
mindspore::python_adapter::set_python_env_flag(true);
|
||||
// Initialize HCCL through the distributed module.
|
||||
if (!mindspore::distributed::Initialize()) {
|
||||
MS_LOG(EXCEPTION) << "InitHccl failed.";
|
||||
}
|
||||
uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
if (ms_context->backend_policy() == "ms") {
|
||||
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
|
||||
|
|
|
@ -270,7 +270,8 @@ bool CollectiveManager::CreateCommunicationGroup(const std::string &group_name,
|
|||
// Wait at most 600 seconds for the device communication group to finish initializing.
|
||||
const int64_t kTimeToWait = 600;
|
||||
// Initialize communication group on the device side in thread with timeout limit.
|
||||
ret = ExecuteFuncInThread(init_device_comm_group_func, kTimeToWait);
|
||||
MS_EXCEPTION_IF_CHECK_FAIL(ExecuteFuncInThread(init_device_comm_group_func, kTimeToWait),
|
||||
"Create group" + group_name + "failed.");
|
||||
MS_LOG(INFO) << "End initialize communication group on the device side.";
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -34,8 +34,10 @@ bool AscendCommunicationGroup::Initialize(void *root_info) {
|
|||
|
||||
unique_id_ = *(static_cast<HcclRootInfo *>(root_info));
|
||||
uint32_t group_rank = GetGroupRank(global_rank_);
|
||||
CHECK_RET(HcclCommInitRootInfo(static_cast<uint32_t>(size_), &unique_id_, static_cast<uint32_t>(group_rank), &comm_),
|
||||
static_cast<int32_t>(HCCL_SUCCESS), "Initializing HCCL communicator failed.");
|
||||
RETURN_IF_FALSE_WITH_LOG(
|
||||
HcclCommInitRootInfo(static_cast<uint32_t>(size_), &unique_id_, static_cast<uint32_t>(group_rank), &comm_) ==
|
||||
static_cast<int32_t>(HCCL_SUCCESS),
|
||||
"Initializing HCCL communicator failed.");
|
||||
initialized_ = true;
|
||||
return true;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue