From e3b1fb76bcbe1174033809c089d62ba6cb2dc694 Mon Sep 17 00:00:00 2001
From: lizhenyu
Date: Mon, 28 Mar 2022 22:35:01 +0800
Subject: [PATCH] [Bugfix] get ckpt failed when call init nccl twice

---
 mindspore/ccsrc/distributed/collective/collective_manager.h | 3 +++
 mindspore/ccsrc/distributed/init.cc | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/mindspore/ccsrc/distributed/collective/collective_manager.h b/mindspore/ccsrc/distributed/collective/collective_manager.h
index 7b9003fe617..519fa334a6f 100644
--- a/mindspore/ccsrc/distributed/collective/collective_manager.h
+++ b/mindspore/ccsrc/distributed/collective/collective_manager.h
@@ -74,6 +74,9 @@ class BACKEND_EXPORT CollectiveManager {
   // Get whether need reinitialize collective communication.
   bool need_reinit() const { return need_reinit_.load(); }
 
+  // Return whether the collective manager is initialized.
+  bool initialized() const { return inited_.load(); }
+
  private:
   CollectiveManager();
 
diff --git a/mindspore/ccsrc/distributed/init.cc b/mindspore/ccsrc/distributed/init.cc
index b84c05d9d9f..d151219d661 100644
--- a/mindspore/ccsrc/distributed/init.cc
+++ b/mindspore/ccsrc/distributed/init.cc
@@ -30,7 +30,7 @@ bool Initialize() {
   }
 
 #if ((defined ENABLE_CPU) && (!defined _WIN32))
-  if (cluster::ClusterContext::instance()->initialized()) {
+  if (cluster::ClusterContext::instance()->initialized() && !collective::CollectiveManager::instance()->initialized()) {
     // Server and Scheduler don't use collective communication library.
     auto node = cluster::ClusterContext::instance()->node();
     MS_EXCEPTION_IF_NULL(node);