forked from mindspore-Ecosystem/mindspore
!45817 Init Hccl of GE
Merge pull request !45817 from jiaorui/ge-hccl-init
This commit is contained in:
commit
4ea8e84b1e
|
@ -1663,15 +1663,18 @@ void ResetOpIdWithOffset() { mindspore::id_generator::reset_id_with_offset(); }
|
|||
void InitHccl() {
|
||||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
ms_context->set_param<bool>(MS_CTX_ENABLE_HCCL, true);
|
||||
#ifdef WITH_BACKEND
|
||||
auto backend = ms_context->backend_policy();
|
||||
if (backend == "ge") {
|
||||
if (!mindspore::distributed::Initialize()) {
|
||||
MS_LOG(EXCEPTION) << "InitHccl failed.";
|
||||
}
|
||||
InitPipeline();
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
mindspore::python_adapter::set_python_env_flag(true);
|
||||
ms_context->set_param<bool>(MS_CTX_ENABLE_HCCL, true);
|
||||
std::string device_name = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
|
||||
if (ms_context->backend_policy() == "ms" && device_name == kAscendDevice) {
|
||||
if (!mindspore::distributed::Initialize()) {
|
||||
|
|
|
@ -444,15 +444,6 @@ void GeDeviceContext::Initialize() {
|
|||
(void)rtGetIsHeterogenous(&is_heterogenous);
|
||||
ms_context->set_param<bool>(MS_CTX_ENABLE_GE_HETEROGENOUS, is_heterogenous == 1);
|
||||
InitGe(ms_context);
|
||||
std::string rank_id = common::GetEnv("RANK_ID");
|
||||
std::string rank_table_file = common::GetEnv("RANK_TABLE_FILE");
|
||||
if (!rank_id.empty() && !rank_table_file.empty()) {
|
||||
MsContext::GetInstance()->set_param<bool>(MS_CTX_ENABLE_HCCL, true);
|
||||
if (!mindspore::distributed::Initialize()) {
|
||||
MS_LOG(EXCEPTION) << "InitHccl failed.";
|
||||
}
|
||||
}
|
||||
|
||||
initialized_ = true;
|
||||
}
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include "runtime/device/memory_manager.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "include/transform/graph_ir/types.h"
|
||||
#include "plugin/device/ascend/hal/hardware/ascend_collective_comm_lib.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
|
@ -45,6 +46,11 @@ class GeDeviceResManager : public DeviceResManager {
|
|||
|
||||
static void CreateSessionAndGraphRunner(bool is_training);
|
||||
|
||||
// Selects the collective communication library used by this resource manager: stores the address of
// the object returned by AscendCollectiveCommLib::GetInstance() in collective_comm_lib_.
// Always returns true; no failure path is reported from here.
bool LoadCollectiveCommLib() override {
|
||||
collective_comm_lib_ = &AscendCollectiveCommLib::GetInstance();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Functions that allocate and free device memory through raw pointers.
|
||||
void *AllocateMemory(size_t size) const override;
|
||||
void FreeMemory(void *ptr) const override;
|
||||
|
|
|
@ -79,7 +79,7 @@ void HcclAdapter::InitPlugin() {
|
|||
return;
|
||||
}
|
||||
|
||||
plugin_handle_ = dlopen(kHcclPluginFileName, RTLD_NOW | RTLD_LOCAL);
|
||||
plugin_handle_ = dlopen(kHcclPluginFileName, RTLD_DEEPBIND | RTLD_NOW | RTLD_LOCAL);
|
||||
if (plugin_handle_ == nullptr) {
|
||||
MS_LOG(EXCEPTION) << "Dlopen " << kHcclPluginFileName << " failed, result = " << GetDlErrorMsg();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue