!45817 Init Hccl of GE

Merge pull request !45817 from jiaorui/ge-hccl-init
This commit is contained in:
i-robot 2022-11-23 08:02:28 +00:00 committed by Gitee
commit 4ea8e84b1e
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
4 changed files with 11 additions and 11 deletions

View File

@ -1663,15 +1663,18 @@ void ResetOpIdWithOffset() { mindspore::id_generator::reset_id_with_offset(); }
void InitHccl() {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
ms_context->set_param<bool>(MS_CTX_ENABLE_HCCL, true);
#ifdef WITH_BACKEND
auto backend = ms_context->backend_policy();
if (backend == "ge") {
if (!mindspore::distributed::Initialize()) {
MS_LOG(EXCEPTION) << "InitHccl failed.";
}
InitPipeline();
return;
}
#endif
mindspore::python_adapter::set_python_env_flag(true);
ms_context->set_param<bool>(MS_CTX_ENABLE_HCCL, true);
std::string device_name = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
if (ms_context->backend_policy() == "ms" && device_name == kAscendDevice) {
if (!mindspore::distributed::Initialize()) {

View File

@ -444,15 +444,6 @@ void GeDeviceContext::Initialize() {
(void)rtGetIsHeterogenous(&is_heterogenous);
ms_context->set_param<bool>(MS_CTX_ENABLE_GE_HETEROGENOUS, is_heterogenous == 1);
InitGe(ms_context);
std::string rank_id = common::GetEnv("RANK_ID");
std::string rank_table_file = common::GetEnv("RANK_TABLE_FILE");
if (!rank_id.empty() && !rank_table_file.empty()) {
MsContext::GetInstance()->set_param<bool>(MS_CTX_ENABLE_HCCL, true);
if (!mindspore::distributed::Initialize()) {
MS_LOG(EXCEPTION) << "InitHccl failed.";
}
}
initialized_ = true;
}

View File

@ -25,6 +25,7 @@
#include "runtime/device/memory_manager.h"
#include "utils/ms_context.h"
#include "include/transform/graph_ir/types.h"
#include "plugin/device/ascend/hal/hardware/ascend_collective_comm_lib.h"
namespace mindspore {
namespace device {
@ -45,6 +46,11 @@ class GeDeviceResManager : public DeviceResManager {
static void CreateSessionAndGraphRunner(bool is_training);
// Sets collective_comm_lib_ to the address of the object returned by
// AscendCollectiveCommLib::GetInstance(). Always returns true.
bool LoadCollectiveCommLib() override {
  auto &ascend_comm_lib = AscendCollectiveCommLib::GetInstance();
  collective_comm_lib_ = &ascend_comm_lib;
  return true;
}
// Relevant function to allocate and free device memory of raw ptr.
void *AllocateMemory(size_t size) const override;
void FreeMemory(void *ptr) const override;

View File

@ -79,7 +79,7 @@ void HcclAdapter::InitPlugin() {
return;
}
plugin_handle_ = dlopen(kHcclPluginFileName, RTLD_NOW | RTLD_LOCAL);
plugin_handle_ = dlopen(kHcclPluginFileName, RTLD_DEEPBIND | RTLD_NOW | RTLD_LOCAL);
if (plugin_handle_ == nullptr) {
MS_LOG(EXCEPTION) << "Dlopen " << kHcclPluginFileName << " failed, result = " << GetDlErrorMsg();
}