!32134 [Bugfix] get ckpt failed when call init nccl twice

Merge pull request !32134 from zyli2020/master
This commit is contained in:
i-robot 2022-03-29 07:05:41 +00:00 committed by Gitee
commit 6488b0612a
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
2 changed files with 4 additions and 1 deletions

View File

@@ -74,6 +74,9 @@ class BACKEND_EXPORT CollectiveManager {
// Return whether collective communication needs to be reinitialized.
bool need_reinit() const { return need_reinit_.load(); }
// Return whether the collective manager has been initialized.
bool initialized() const { return inited_.load(); }
private:
CollectiveManager();

View File

@@ -30,7 +30,7 @@ bool Initialize() {
}
#if ((defined ENABLE_CPU) && (!defined _WIN32))
if (cluster::ClusterContext::instance()->initialized()) {
if (cluster::ClusterContext::instance()->initialized() && !collective::CollectiveManager::instance()->initialized()) {
// Server and Scheduler don't use collective communication library.
auto node = cluster::ClusterContext::instance()->node();
MS_EXCEPTION_IF_NULL(node);